Diffstat (limited to 'compute')
249 files changed, 51005 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt
new file mode 100644
index 000000000..aaebff758
--- /dev/null
+++ b/compute/ARMComputeEx/CMakeLists.txt
@@ -0,0 +1,32 @@
+nnas_find_package(ARMCompute QUIET)
+
+if(NOT ARMCompute_FOUND)
+  message(STATUS "Check ARM Compute library extension build: need ARM Compute library")
+  return()
+else(NOT ARMCompute_FOUND)
+  message(STATUS "Check ARM Compute library extension build: OK")
+endif(NOT ARMCompute_FOUND)
+
+set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR})
+
+file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp")
+
+# Generate the embedded CL kernel sources
+execute_process (
+  WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+  COMMAND bash -c "python resolve_includes.py"
+)
+
+add_library(arm_compute_ex SHARED ${ACL_EX_SRCS})
+target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE})
+target_link_libraries(arm_compute_ex PRIVATE arm_compute)
+target_link_libraries(arm_compute_ex PRIVATE nnfw_common)
+target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage)
+# Defines to enable validate checks in debug builds
+target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS
+                           $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED
+                           ARM_COMPUTE_LOGGING_ENABLED>)
+# Validate-check functions are not used in release builds.
+# Some parameters are used only by validate-check calls and may be unused in release builds.
+target_compile_options(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:-Wno-unused-parameter -Wno-unused-function>)
+install(TARGETS arm_compute_ex DESTINATION lib)
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
new file mode 100644
index 000000000..e4e752ef9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLKernelLibraryEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines
+ *        an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL.
+ */
+
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+
+/**
+ * @brief Class to build OpenCL kernels added by nnfw
+ */
+class CLKernelLibraryEx
+{
+  using StringSet = std::set<std::string>;
+
+private:
+  /**
+   * @brief Construct a new CLKernelLibraryEx object
+   */
+  CLKernelLibraryEx();
+
+public:
+  /**
+   * @brief Prevent instances of this class from being copied.
+   */
+  CLKernelLibraryEx(const CLKernelLibraryEx &) = delete;
+
+  /**
+   * @brief Prevent instances of this class from being copy-assigned.
+   */
+  const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete;
+
+  /**
+   * @brief Get the KernelLibrary singleton.
+   * @return The KernelLibrary instance
+   */
+  static CLKernelLibraryEx &get();
+
+  /**
+   * @brief Initialise the kernel library.
+   * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+   * @param[in] context     CL context used to create programs.
+   * @param[in] device      CL device for which the programs are created.
+   * @return N/A
+   */
+  void init(std::string kernel_path, cl::Context context, cl::Device device)
+  {
+    _kernel_path = std::move(kernel_path);
+    _context = std::move(context);
+    _device = std::move(device);
+  }
+
+  /**
+   * @brief Set the path that the kernels reside in.
+   * @param[in] kernel_path Path of the directory from which kernel sources are loaded.
+   * @return N/A
+   */
+  void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }
+
+  /**
+   * @brief Get the path that the kernels reside in.
+   * @return The path of the kernel files
+   */
+  std::string get_kernel_path() { return _kernel_path; }
+
+  /**
+   * @brief Get the source of the selected program.
+   * @param[in] program_name Program name.
+   * @return Source of the selected program.
+   */
+  std::string get_program_source(const std::string &program_name);
+
+  /**
+   * @brief Set the CL context used to create programs.
+   * @note Setting the context also resets the device to the
+   *       first one available in the new context.
+   * @param[in] context A CL context.
+   * @return N/A
+   */
+  void set_context(cl::Context context)
+  {
+    _context = std::move(context);
+    if (_context.get() == nullptr)
+    {
+      _device = cl::Device();
+    }
+    else
+    {
+      const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+      if (cl_devices.empty())
+      {
+        _device = cl::Device();
+      }
+      else
+      {
+        _device = cl_devices[0];
+      }
+    }
+  }
+
+  /**
+   * @brief Return the associated CL context.
+   * @return A CL context.
+   */
+  cl::Context &context() { return _context; }
+
+  /**
+   * @brief Set the CL device for which the programs are created.
+   * @param[in] device A CL device.
+   * @return N/A
+   */
+  void set_device(cl::Device device) { _device = std::move(device); }
+
+  /**
+   * @brief Get the CL device for which the programs are created.
+   * @return A CL device.
+   */
+  cl::Device &get_device() { return _device; }
+
+  /**
+   * @brief Return the device version.
+   * @return The content of CL_DEVICE_VERSION
+   */
+  std::string get_device_version();
+
+  /**
+   * @brief Create a kernel from the kernel library.
+   * @param[in] kernel_name       Kernel name.
+   * @param[in] build_options_set Kernel build options as a set.
+   * @return The created kernel.
+   */
+  Kernel create_kernel(const std::string &kernel_name,
+                       const StringSet &build_options_set = {}) const;
+
+  /**
+   * @brief Find the maximum number of local work items in a work-group that can be supported
+   *        for the kernel.
+   * @param[in] kernel Kernel object
+   * @return Maximum number of local work items supported for the kernel
+   */
+  size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
+
+  /**
+   * @brief Return the default NDRange for the device.
+   * @return Default NDRange of the device
+   */
+  cl::NDRange default_ndrange() const;
+
+  /**
+   * @brief Clear the library's cache of binary programs.
+   * @return N/A
+   */
+  void clear_programs_cache()
+  {
+    _programs_map.clear();
+    _built_programs_map.clear();
+  }
+
+  /**
+   * @brief Access the cache of built OpenCL programs.
+   * @return Program map whose key is a kernel name and whose value is the
+   *         kernel source name (*.cl)
+   */
+  const std::map<std::string, cl::Program> &get_built_programs() const
+  {
+    return _built_programs_map;
+  }
+
+  /**
+   * @brief Add a new built program to the cache.
+   * @param[in] built_program_name Name of the program
+   * @param[in] program            Built program to add to the cache
+   * @return N/A
+   */
+  void add_built_program(const std::string &built_program_name, cl::Program program);
+
+  /**
+   * @brief Return true if FP16 is supported by the CL device.
+   * @return true if the CL device supports FP16
+   */
+  bool fp16_supported() const;
+
+  /**
+   * @brief Return true if the int64_base_atomics extension is supported by the CL device.
+   * @return true if the CL device supports the int64_base_atomics extension
+   */
+  bool int64_base_atomics_supported() const;
+
+private:
+  /**
+   * @brief Load a program and its dependencies.
+   * @param[in] program_name Name of the program to load.
+   */
+  const Program &load_program(const std::string &program_name) const;
+  /**
+   * @brief Concatenate the contents of a set into a single string.
+   * @param[in] s Input set to concatenate.
+   * @return Concatenated string.
+   */
+  std::string stringify_set(const StringSet &s) const;
+
+  cl::Context _context;     /**< Underlying CL context. */
+  cl::Device _device;       /**< Underlying CL device. */
+  std::string _kernel_path; /**< Path to the kernels folder. */
+  mutable std::map<std::string, const Program>
+      _programs_map; /**< Map with all already loaded program data. */
+  mutable std::map<std::string, cl::Program>
+      _built_programs_map; /**< Map with all already built program data. */
+  static const std::map<std::string, std::string>
+      _kernel_program_map; /**< Map that associates kernel names with programs. */
+  static const std::map<std::string, std::string>
+      _program_source_map; /**< Contains sources for all programs.
+                                Used for compile-time kernel inclusion. */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
new file mode 100644
index 000000000..b98b174f7
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLArgOperationKernel.h
+ * @brief This file defines CLArgOperationKernel
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to define an interface for the argop kernel.
+ */
+class CLArgOperationKernel : public ICLKernel
+{
+public:
+  /**
+   * @brief Default constructor.
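The library above follows a get()/init()/create_kernel() flow. A minimal usage sketch, assuming the CL runtime is set up through CLScheduler and that "cast" names one of the embedded programs (both are assumptions, not confirmed by this diff):

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h" // for arm_compute::Kernel
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <set>
#include <string>

void build_extension_kernel()
{
  using namespace arm_compute;

  // Create a default context/queue on the first available OpenCL device.
  CLScheduler::get().default_init();

  // With EMBEDDED_KERNELS defined (see CMakeLists.txt above), the kernel path
  // is unused and sources come from _program_source_map.
  CLKernelLibraryEx::get().init("./cl_kernels/", CLScheduler::get().context(),
                                cl::Device::getDefault());

  // Build options are plain "-D..." strings collected in a set.
  std::set<std::string> build_opts{"-DDATA_TYPE_IN=uchar", "-DDATA_TYPE_OUT=float"};
  Kernel kernel = CLKernelLibraryEx::get().create_kernel("cast", build_opts); // hypothetical program name
  (void)kernel;
}
```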
+ */ + CLArgOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied + */ + CLArgOperationKernel(const CLArgOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied + * @return Reference of this instance + */ + CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved + */ + CLArgOperationKernel(CLArgOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved + * @return Reference of this instance + */ + CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default; + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[out] output The output tensor, Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. + * return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op); + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLArgOperationKernel + * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] output The output tensor info, Data types supported: S32. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] op Arg operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ArgOperation op); + + /* + * @brief Run CLArgOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h new file mode 100644 index 000000000..ab33d9d3a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
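CLArgOperationKernel shows the validate → configure → enqueue pattern that all of the kernels in this diff share. A sketch, assuming TypesEx.h provides ArgOperation::MAX (the enum is not shown here) and that both tensors are already allocated:

```cpp
#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <cstdint>

void run_argmax(arm_compute::CLTensor &input, arm_compute::CLTensor &output, uint32_t axis)
{
  using namespace arm_compute;

  // validate() checks the documented type constraints (input: U8/QASYMM8/S32/F32,
  // output: S32) before any OpenCL work is done.
  ARM_COMPUTE_ERROR_THROW_ON(
      CLArgOperationKernel::validate(input.info(), output.info(), axis, ArgOperation::MAX));

  CLArgOperationKernel kernel;
  kernel.configure(&input, &output, axis, ArgOperation::MAX);

  // The scheduler invokes run() with the kernel's window on its command queue.
  CLScheduler::get().enqueue(kernel);
}
```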
+ */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ +class CLBinaryLogicalOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBinaryLogicalOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h new file mode 100644 index 000000000..16cef0b61 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file CLCastKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastKernel class + */ + +#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ +#define __ARM_COMPUTE_CLCASTKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define OpenCL kernel for cast operation + */ +class CLCastKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLCastKernel object + */ + CLCastKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLCastKernel(const CLCastKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLCastKernel &operator=(const CLCastKernel &) = delete; + + /** + * @brief Construct CLCastKernel object using default move constructor + * @param[in] CLCastKernel object to move + */ + CLCastKernel(CLCastKernel &&) = default; + + /** + * @brief Allow instances of this class to be moved + * @param[in] CLCastKernel object to move + */ + CLCastKernel &operator=(CLCastKernel &&) = default; + + /** + * @brief Destruct this CLCastKernel object + */ + ~CLCastKernel() = default; + + /** + * @brief Initialise the kernel's input and output. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] input_subtype Sub data type of input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h new file mode 100644 index 000000000..60ec7a82a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
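A configure sketch for the cast kernel above. The source and destination types are taken from the tensors themselves; SubDataType::NONE is an assumed enumerator from TypesEx.h for the "no sub-type" case:

```cpp
#include "arm_compute/core/CL/kernels/CLCastKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void cast_u8_to_f32(arm_compute::CLTensor &src_u8, arm_compute::CLTensor &dst_f32)
{
  arm_compute::CLCastKernel cast;

  // The conversion is implied by the tensors' data types; the sub-type is only
  // needed for cases such as booleans carried in U8 storage (assumption).
  cast.configure(&src_u8, &dst_f32, arm_compute::SubDataType::NONE);
  arm_compute::CLScheduler::get().enqueue(cast);
}
```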
+ */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform depthTospace operation */ +class CLDepthToSpaceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLDepthToSpaceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default; + /** Default destructor */ + ~CLDepthToSpaceKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h new file mode 100644 index 000000000..da075db69 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
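For the depth-to-space kernel above, the shape contract is implicit in its name: with block size b, an NCHW input of shape (W, H, C) becomes (W*b, H*b, C/b²). A sketch with b = 2 and illustrative shapes:

```cpp
#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void depth_to_space_example()
{
  using namespace arm_compute;

  CLTensor input, output;
  // (W, H, C) = (8, 8, 16) -> (16, 16, 4): width and height double, depth shrinks by 2 * 2.
  input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 4U), 1, DataType::F32));
  input.allocator()->allocate();
  output.allocator()->allocate();

  CLDepthToSpaceKernel kernel;
  kernel.configure(&input, &output, /*block_size=*/2);
  CLScheduler::get().enqueue(kernel);
}
```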
+ */ + +/** + * @file CLEmbeddingLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLEmbeddingLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform EmbeddingLookup operation with opencl kernel +*/ +class CLEmbeddingLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLEmbeddingLookupKernel object + * */ + CLEmbeddingLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLEmbeddingLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] input Source tensor. + * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] lookups Lookups are 1D tensor that values are indices into the first + * dimension of input. + * Data types supported: S32. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLEmbeddingLookupKernel + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * @param[in] lookups Lookups info. Data types supported: S32. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + const ICLTensor *_lookups; /** Lookups tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h new file mode 100644 index 000000000..aa81a1efa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLGatherExKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLGatherExKernel class + */ + +#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ +#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define an interface for the gather kernel. + */ +class CLGatherExKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLGatherExKernel object + * */ + CLGatherExKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel(const CLGatherExKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel &operator=(const CLGatherExKernel &) = delete; + + /** + * @brief Construct CLGatherExKernel object by using default move constructor + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel(CLGatherExKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel &operator=(CLGatherExKernel &&) = default; + + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLGatherExKernel + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. 
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_indices; + ICLTensor *_output; + int _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h new file mode 100644 index 000000000..8269e5a7a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
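A gather sketch for the class above: indices select slices of the input along axis, and negative axis values wrap around. All names are illustrative:

```cpp
#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void gather_along_axis1(arm_compute::CLTensor &params, arm_compute::CLTensor &indices,
                        arm_compute::CLTensor &output)
{
  using namespace arm_compute;

  // indices must be S32; the output takes the element type of params.
  ARM_COMPUTE_ERROR_THROW_ON(
      CLGatherExKernel::validate(params.info(), indices.info(), output.info(), /*axis=*/1));

  CLGatherExKernel gather;
  gather.configure(&params, &indices, &output, /*axis=*/1);
  CLScheduler::get().enqueue(gather);
}
```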
+ */ + +/** + * @file CLHashtableLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLHashtableLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform HashtableLookup operation with opencl kernel +*/ +class CLHashtableLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLHashtableLookupKernel object + * */ + CLHashtableLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Construct a CLHashtableLookupKernel object by using default move constructor + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLHashtableLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, + ICLTensor *output, ICLTensor *hits); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLHashtableLookupKernel + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output The output tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_lookups{nullptr}; /** Lookups tensor */ + const ICLTensor *_keys{nullptr}; /** Keys tensor */ + const ICLTensor *_input{nullptr}; /** Source tensor */ + ICLTensor *_output{nullptr}; /** Destination tensor */ + ICLTensor *_hits{nullptr}; /** Hits tensor */ + std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /** Lookup indices tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..f5e147e03 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ +#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for performing an instance normalization */ +class CLInstanceNormalizationLayerKernelEx : public ICLKernel +{ +public: + /** Constructor */ + CLInstanceNormalizationLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLInstanceNormalizationLayerKernelEx & + operator=(const CLInstanceNormalizationLayerKernelEx &) = delete; + /** Default Move Constructor. */ + CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default; + /** Default move assignment operator */ + CLInstanceNormalizationLayerKernelEx & + operator=(CLInstanceNormalizationLayerKernelEx &&) = default; + /** Default destructor */ + ~CLInstanceNormalizationLayerKernelEx() = default; + + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. Data types supported: F16/F32. 
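Hashtable lookup takes five tensors: keys[i] labels row i of the input table, lookups holds the requested keys, and hits records which lookups matched. A sketch with all tensors assumed pre-allocated:

```cpp
#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void hashtable_lookup(arm_compute::CLTensor &lookups, arm_compute::CLTensor &keys,
                      arm_compute::CLTensor &table, arm_compute::CLTensor &output,
                      arm_compute::CLTensor &hits)
{
  using namespace arm_compute;

  // lookups/keys are S32; hits is a U8/QASYMM8 boolean tensor, one entry per lookup.
  ARM_COMPUTE_ERROR_THROW_ON(CLHashtableLookupKernel::validate(
      lookups.info(), keys.info(), table.info(), output.info(), hits.info()));

  CLHashtableLookupKernel kernel;
  kernel.configure(&lookups, &keys, &table, &output, &hits);
  CLScheduler::get().enqueue(kernel);
}
```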
Data layout supported: + * NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr, + ICLTensor *beta = nullptr, float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLInstanceNormalizationLayerEx. + * + * @param[in] input Source tensor info. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_output; + ICLTensor *_gamma; + ICLTensor *_beta; + float _epsilon; + bool _run_in_place; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h new file mode 100644 index 000000000..ccbea147e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ +#define __ARM_COMPUTE_CLNEGKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform a negation operation on tensor*/ +class CLNegKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLNegKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel(const CLNegKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
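Instance normalization normalizes each (H, W) plane per channel and per batch item, y = γ·(x − μ)/√(σ² + ε) + β. A configure sketch with the optional γ/β left at their nullptr defaults (scale 1, offset 0):

```cpp
#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void instance_norm(arm_compute::CLTensor &input, arm_compute::CLTensor &output)
{
  using namespace arm_compute;

  ARM_COMPUTE_ERROR_THROW_ON(
      CLInstanceNormalizationLayerKernelEx::validate(input.info(), output.info()));

  CLInstanceNormalizationLayerKernelEx kernel;
  // epsilon keeps the denominator away from zero; 1e-12 is the documented default.
  kernel.configure(&input, &output, /*gamma=*/nullptr, /*beta=*/nullptr, /*epsilon=*/1e-12f);
  CLScheduler::get().enqueue(kernel);
}
```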
*/ + CLNegKernel &operator=(const CLNegKernel &) = delete; + /** Allow instances of this class to be moved */ + CLNegKernel(CLNegKernel &&) = default; + /** Allow instances of this class to be moved */ + CLNegKernel &operator=(CLNegKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h new file mode 100644 index 000000000..eff1b8bd5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__ +#define __ARM_COMPUTE_CLPRELU_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to calculate PReLU*/ +class CLPReLUKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLPReLUKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPReLUKernel(const CLPReLUKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPReLUKernel &operator=(const CLPReLUKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPReLUKernel(CLPReLUKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPReLUKernel &operator=(CLPReLUKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor1. + * @param[in] alpha Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + const ICLTensor *_alpha; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h new file mode 100644 index 000000000..a26a4a7fc --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLReduceOperationKernel.h + * @brief This file defines CLReduceOperationKernel class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the reduce operation kernel + */ +class CLReduceOperationKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor + */ + CLReduceOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel(const CLReduceOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel(CLReduceOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default; + /** + * @brief Default destructor + */ + ~CLReduceOperationKernel() = default; + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, + ReduceOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperationKernel. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReduceOperation op); + + /* + * @brief Run CLReduceOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue CLQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h new file mode 100644 index 000000000..577e38cc4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ +#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation */ +class CLSpaceToBatchNDKernel final : public ICLKernel +{ +public: + /** Default constructor */ + CLSpaceToBatchNDKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default; + /** Default destructor */ + ~CLSpaceToBatchNDKernel() = default; + /** Initialise the kernel's input and output. + * + * @note The data layout of input and output must be the same. + * @note The number of dimensions of input and output must be 4, and `spatial` dimensions + * are height and width. + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. + * Data layout supported: NCHW/NHWC + * @param[in] block_size Block size tensor. Data types supported: S32. + * @param[in] padding_size Padding size tensor. Data types supported: S32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32. 
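A reduction sketch in the same pattern; ReduceOperation::MEAN is an assumed enumerator from TypesEx.h. Per the docs above, the output keeps the input's rank, with the reduced axis collapsed to size 1:

```cpp
#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <cstdint>

void reduce_mean(arm_compute::CLTensor &input, arm_compute::CLTensor &output, uint32_t axis)
{
  using namespace arm_compute;

  ARM_COMPUTE_ERROR_THROW_ON(
      CLReduceOperationKernel::validate(input.info(), output.info(), axis, ReduceOperation::MEAN));

  CLReduceOperationKernel kernel;
  kernel.configure(&input, &output, axis, ReduceOperation::MEAN);
  CLScheduler::get().enqueue(kernel);
}
```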
+ * Data layout supported: NCHW/NHWC + */ + void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size, + ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input{nullptr}; /**< Source tensor */ + const ICLTensor *_block_size{nullptr}; /**< Block size tensor */ + const ICLTensor *_padding_size{nullptr}; /**< Padding size tensor */ + ICLTensor *_output{nullptr}; /**< Destination tensor */ +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h new file mode 100644 index 000000000..be845a549 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ +#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform spaceTodepth operation */ +class CLSpaceToDepthKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLSpaceToDepthKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete; + /** Allow instances of this class to be moved */ + CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default; + /** Allow instances of this class to be moved */ + CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default; + /** Default destructor */ + ~CLSpaceToDepthKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
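Unlike the scalar block_size parameters elsewhere in this diff, SPACE_TO_BATCH_ND receives its block sizes and paddings as S32 tensors, so they can be runtime values. A configure sketch:

```cpp
#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

void space_to_batch_nd(arm_compute::CLTensor &input, arm_compute::CLTensor &block_size,
                       arm_compute::CLTensor &padding_size, arm_compute::CLTensor &output)
{
  using namespace arm_compute;

  // block_size: one S32 per spatial dimension (height, width); padding_size:
  // an S32 (before, after) pair per spatial dimension (layout assumed).
  CLSpaceToBatchNDKernel kernel;
  kernel.configure(&input, &block_size, &padding_size, &output);
  CLScheduler::get().enqueue(kernel);
}
```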
+ */ + void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h new file mode 100644 index 000000000..8da2daecc --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLTopKV2Kernel.h + * @brief This file defines classes for TopKV2Kernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ +#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +// these parameters can be changed +#define _ITEMS 16 // number of items in a group +#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS +#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram +#define PERMUT // store the final permutation +//////////////////////////////////////////////////////// + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define CLTopKV2Single + */ +class CLTopKV2Single : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Single(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + */ + CLTopKV2Single(const CLTopKV2Single &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
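The TopKV2 tuning macros above are interdependent: 16 items × 4 groups gives 64 virtual processors, and the histogram is split into 64 / 2 = 32 parts. A compile-time restatement of those relations:

```cpp
// Derived values for the macros above (_ITEMS = 16, _GROUPS = 4).
constexpr int kItems = 16;                           // _ITEMS: work-items per group
constexpr int kGroups = 4;                           // _GROUPS
constexpr int kVirtualProcessors = kItems * kGroups; // 16 * 4 = 64
constexpr int kHistoSplit = kVirtualProcessors / 2;  // _HISTOSPLIT = 32
static_assert(kVirtualProcessors == 64 && kHistoSplit == 32, "unexpected tuning values");
```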
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + * @return Reference of this instance + */ + CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + */ + CLTopKV2Single(CLTopKV2Single &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + * @return Reference of this instance + */ + CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] topk_values Values of the top k predictions + * @param[in] topk_indices Indices of the top k predictions + * @param[in] indices Indices + * @param[in] temp_stack Temp stack + * @param[in] k K of the top k predictions + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); + + /* + * @brief Run CLTopKV2Single op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_topk_values; + ICLTensor *_topk_indices; +}; + +/** + * @brief Class to define CLTopKV2Init + */ +class CLTopKV2Init : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Init(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + */ + CLTopKV2Init(const CLTopKV2Init &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + * @return Reference of this instance + */ + CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + */ + CLTopKV2Init(CLTopKV2Init &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + * @return Reference of this instance + */ + CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] in_key_buf Buffer of input key + * @param[in] in_ind_buf Buffer of input index + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); + + /* + * @brief Run CLTopKV2Init op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; +}; + +/** + * @brief Class to define CLRadixSortHistogram + */ +class CLRadixSortHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + */ + CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + */ + CLRadixSortHistogram(CLRadixSortHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + } + + /* + * @brief Run CLRadixSortHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; +}; + +/** + * @brief Class to define CLRadixSortScanHistogram + */ +class CLRadixSortScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + */ + CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + */ + CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortGlobalScanHistogram + */ +class CLRadixSortGlobalScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortGlobalScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + */ + CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + */ + CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] glob_sum_buf Buffer of global sum + * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); + + /* + * @brief Run CLRadixSortGlobalScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortPasteHistogram + */ +class CLRadixSortPasteHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortPasteHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + */ + CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + */ + CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortPasteHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortReorder + */ +class CLRadixSortReorder : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortReorder(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + */ + CLRadixSortReorder(const CLRadixSortReorder &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + */ + CLRadixSortReorder(CLRadixSortReorder &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + /* + * @brief Run CLRadixSortReorder op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2FindFirstNegative + */ +class CLTopKV2FindFirstNegative : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2FindFirstNegative(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + */ + CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + */ + CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set output buffer + * @param[out] out_key_buf Buffer of output key + * return N/A + */ + void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } + + /* + * @brief Run CLTopKV2FindFirstNegative op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_out_key_buf; +}; + +/** + * @brief Class to define CLTopKV2ReorderNegatives + */ +class CLTopKV2ReorderNegatives : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2ReorderNegatives(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + */ + CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + */ + CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set buffers + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + + /* + * @brief Run CLTopKV2ReorderNegatives op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2Store + */ +class CLTopKV2Store : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Store(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + */ + CLTopKV2Store(const CLTopKV2Store &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + * @return Reference of this instance + */ + CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + */ + CLTopKV2Store(CLTopKV2Store &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + * @return Reference of this instance + */ + CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] values Values tensor to store + * @param[out] indices Indices tensor to be used for store + * @param[in] k K of the top k predictions + * @param[in] n Number times to store + * return N/A + */ + void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + + /** + * @brief Set buffers + * @param[out] out_key_buf Buffer of output key + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); + + /* + * @brief Run CLTopKV2Store op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_values; + ICLTensor *_indices; + cl::Buffer *_out_key_buf; + cl::Buffer *_out_ind_buf; +}; + +} // namespace arm_compute +#endif // Disable GPU implementation +#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h new file mode 100644 index 000000000..c5ef730b6 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL. + */ +class CLTransposeConvLayerUpsampleKernel : public ICLKernel +{ +public: + /** Constructor */ + CLTransposeConvLayerUpsampleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTransposeConvLayerUpsampleKernel & + operator=(const CLTransposeConvLayerUpsampleKernel &) = delete; + /** Default Move Constructor. 
*/ + CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default; + /** Default move assignment operator */ + CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default; + /** Default destructor */ + ~CLTransposeConvLayerUpsampleKernel() = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. All but + * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only + * performed within the XY-plane. + * @param[in] inner_border Top and right inner border sizes. These rows and columns will be + * filled with zero. + * @param[in] info Contains padding and stride information described in @ref + * PadStrideInfo. + */ + void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border, + const PadStrideInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayerUpsample + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data types supported: same as @p input. All + * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is + * only performed within the XY-plane. + * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled + * with zero. + * @param[in] info Contains padding and stride information described in @ref + * PadStrideInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, const PadStrideInfo &info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + BorderSize _inner_border; + PadStrideInfo _info; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h new file mode 100644 index 000000000..d093c22cb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ +#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** CPP kernel to perform tensor upsample. + * + */ +class CPPUpsampleKernelEx : public ICPPKernel +{ +public: + const char *name() const override { return "CPPUpsampleKernelEx"; } + /** Default constructor */ + CPPUpsampleKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default; + /** Default destructor */ + ~CPPUpsampleKernelEx() = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] info Padding info. + */ + void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; + +private: + const ITensor *_input; + ITensor *_output; + PadStrideInfo _info; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h new file mode 100644 index 000000000..358e0ebc6 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
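
Both upsample kernels above (CLTransposeConvLayerUpsampleKernel and CPPUpsampleKernelEx) scatter input values into a zero-initialised output at stride spacing; the transpose convolution then runs an ordinary convolution over the result. A scalar sketch of that zero-insertion, assuming a single channel and row-major layout (upsample_ref and its parameters are illustrative, not part of this patch):

#include <cstddef>
#include <vector>

// Scalar reference of stride-based zero-insertion upsampling: every input
// element lands at (x * stride_x, y * stride_y) in a zero-filled output.
std::vector<float> upsample_ref(const std::vector<float> &in, std::size_t w, std::size_t h,
                                std::size_t stride_x, std::size_t stride_y)
{
  const std::size_t out_w = w * stride_x;
  std::vector<float> out(out_w * h * stride_y, 0.0f); // zero fill
  for (std::size_t y = 0; y < h; ++y)
    for (std::size_t x = 0; x < w; ++x)
      out[(y * stride_y) * out_w + (x * stride_x)] = in[y * w + x];
  return out;
}
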
+ */ + +#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ +#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +class ITensor; +class Window; +class QuantizationInfo; +} // namespace arm_compute + +namespace arm_compute +{ + +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, + const float32x4_t &scale); + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, + const float32x4_t &invscale); + +float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale); + +void elementwise_op_quantized( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, + float32x4_t, float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, + int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)); +} // namespace arm_compute +#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h new file mode 100644 index 000000000..61992bd50 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
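
The helpers declared above split every elementwise operation into three callbacks: a vectorised neon_func body, a broadcast_func for scalar-broadcast windows, and a scalar_func tail; the quantized overload additionally dequantizes via load_quantized and requantizes via store_quantized. A simplified standalone analogue of the body/tail split for a float maximum (illustrative only; the real routines also handle tensor strides and windows):

#include <arm_neon.h>

static inline float scalar_max(const float &a, const float &b) { return a > b ? a : b; }

// The vectorised body handles four lanes per iteration; the scalar loop
// mops up the remainder, mirroring the scalar_func callback above.
void elementwise_max_f32(const float *x, const float *y, float *z, int n)
{
  int i = 0;
  for (; i <= n - 4; i += 4)
    vst1q_f32(z + i, vmaxq_f32(vld1q_f32(x + i), vld1q_f32(y + i)));
  for (; i < n; ++i)
    z[i] = scalar_max(x[i], y[i]);
}
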
+ */
+#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
+#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
+
+#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+
+class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel
+{
+public:
+  /** Default destructor */
+  ~NEBinaryLogicalOperationKernel() = default;
+
+  /** Initialise the kernel's inputs and output
+   *
+   * @param[in]  op     Binary logical operation to be executed.
+   * @param[in]  input1 First tensor input. Data types supported: QASYMM8/U8.
+   * @param[in]  input2 Second tensor input. Data types supported: Same as @p input1.
+   * @param[out] output Output tensor. Data types supported: Same as @p input1.
+   */
+  void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2,
+                 ITensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEBinaryLogicalOperationKernel
+   *
+   * @param[in] op     Binary logical operation to be executed.
+   * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8.
+   * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+   * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+   *
+   * @return a Status
+   */
+  static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1,
+                         const ITensorInfo *input2, const ITensorInfo *output);
+
+protected:
+  // Inherited methods overridden:
+  static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
+                                   const ITensorInfo &output);
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
new file mode 100644
index 000000000..fd2a2ee3b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
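
A hedged sketch of driving NEBinaryLogicalOperationKernel, following the usual ACL validate-then-configure pattern. Tensor shapes and allocation are omitted, and BinaryLogicalOperation::AND is assumed to be one of the enumerators declared in TypesEx.h:

// Assumed includes: the kernel header above, plus
// "arm_compute/runtime/Tensor.h" and "arm_compute/runtime/NEON/NEScheduler.h".
void logical_and(arm_compute::Tensor &a, arm_compute::Tensor &b, arm_compute::Tensor &out)
{
  using namespace arm_compute;
  // Check the tensor infos first; configure() asserts on an invalid setup.
  const Status s = NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation::AND,
                                                            a.info(), b.info(), out.info());
  if (s.error_code() != ErrorCode::OK)
    return;
  NEBinaryLogicalOperationKernel kernel;
  kernel.configure(BinaryLogicalOperation::AND, &a, &b, &out);
  NEScheduler::get().schedule(&kernel, Window::DimY); // multi-threaded over rows
}
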
+ */ +#ifndef __ARM_COMPUTE_NECASTKERNEL_H__ +#define __ARM_COMPUTE_NECASTKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the cast layer kernel. */ +class NECastKernel : public INEKernel +{ +public: + const char *name() const override { return "NECastKernel"; } + /** Default constructor */ + NECastKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastKernel(const NECastKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastKernel &operator=(const NECastKernel &) = delete; + /** Default Move Constructor. */ + NECastKernel(NECastKernel &&) = default; + /** Default move assignment operator */ + NECastKernel &operator=(NECastKernel &&) = default; + /** Default destructor */ + ~NECastKernel() = default; + /** Set input, output tensors. + * + * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(const ITensor *input, ITensor *output, SubDataType input_subtype); + /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + SubDataType _input_subtype; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h new file mode 100644 index 000000000..5b6ef6bfb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ +#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the depth to space kernel */ +class NEDepthToSpaceLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; } + /** Default constructor */ + NEDepthToSpaceLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default; + /** Default destructor */ + ~NEDepthToSpaceLayerKernelEx() = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape x value. + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEDepthToSpaceLayerKernelEx. + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape value. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h new file mode 100644 index 000000000..d6fad1155 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
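
For reference, the rearrangement performed by NEDepthToSpaceLayerKernelEx above can be written out in scalar form. A sketch for a single NHWC batch with block shape bs, assuming the usual depth-column-row (DCR) ordering (illustrative helper only; the kernel also supports NCHW and the integer/quantized types listed in its documentation):

#include <cstddef>
#include <vector>

// DCR ordering: output(h*bs+i, w*bs+j, c) = input(h, w, (i*bs + j)*C_out + c).
std::vector<float> depth_to_space_nhwc(const std::vector<float> &in, int H, int W, int C_out,
                                       int bs)
{
  const int C_in = C_out * bs * bs;
  std::vector<float> out(static_cast<std::size_t>(H) * bs * W * bs * C_out);
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int i = 0; i < bs; ++i)
        for (int j = 0; j < bs; ++j)
          for (int c = 0; c < C_out; ++c)
            out[(((h * bs + i) * (W * bs)) + (w * bs + j)) * C_out + c] =
                in[((h * W) + w) * C_in + (i * bs + j) * C_out + c];
  return out;
}
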
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
+#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for an element-wise unary operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ output(x) = OP(input(x))@f]
+ *
+ */
+class NEElementwiseUnaryKernelEx : public INEKernel
+{
+public:
+  const char *name() const override { return "NEElementwiseUnaryKernelEx"; }
+  /** Default constructor */
+  NEElementwiseUnaryKernelEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete;
+  /** Allow instances of this class to be moved */
+  NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default;
+  /** Allow instances of this class to be moved */
+  NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default;
+  /** Default destructor */
+  ~NEElementwiseUnaryKernelEx() = default;
+
+  /** Initialise the kernel's input and output
+   *
+   * @param[in]  op     Arithmetic operation to be executed.
+   * @param[in]  input  First tensor input. Data types supported: F16/F32/S32.
+   * @param[out] output Output tensor. Data types supported: Same as @p input.
+   */
+  void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEElementwiseUnaryKernelEx
+   *
+   * @param[in] op     Arithmetic operation to be executed.
+   * @param[in] input  First tensor input info. Data types supported: F16/F32/S32.
+   * @param[in] output Output tensor info. Data types supported: Same as @p input.
+   *
+   * @return a Status
+   */
+  static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input,
+                         const ITensorInfo *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+  /** Common signature for all the specialised arithmetic functions
+   *
+   * @param[in]  input  An input tensor. Data types supported: F16/F32/S32.
+ * @param[out] output The output tensor. Data types supported: Same as @p input. + * @param[in] window Region on which to execute the kernel. + */ + using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, + const Window &window); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output); + + /** Function to use for the particular tensor types passed to configure() */ + std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function; + + const ITensor *_input; + ITensor *_output; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h new file mode 100644 index 000000000..1490e75f2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform EmbeddingLookup operation */ +class NEEmbeddingLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEEmbeddingLookupKernel"; } + /** Default constructor */ + NEEmbeddingLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Destination tensor. 
Data types supported: same as @p input.
+   * @param[in]  lookups Lookups 1D tensor whose values are indices into the first dimension of
+   * input.
+   */
+  void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEEmbeddingLookupKernel
+   *
+   * @param[in] input   Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+   * @param[in] output  Destination tensor. Data types supported: same as @p input.
+   * @param[in] lookups Lookups info. Data types supported: S32.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *lookups);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  const ITensor *_input;
+  const ITensor *_lookups;
+  ITensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
new file mode 100644
index 000000000..3fa9c6e9a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
+#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform Gather operation on NEON */
+class NEGatherKernelEx : public INEKernel
+{
+public:
+  /** Default constructor. */
+  NEGatherKernelEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEGatherKernelEx(const NEGatherKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete;
+  /** Allow instances of this class to be moved. */
+  NEGatherKernelEx(NEGatherKernelEx &&) = default;
+  /** Allow instances of this class to be moved.
*/
+  NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default;
+  /** Default destructor */
+  ~NEGatherKernelEx() = default;
+
+  /** Name of the kernel
+   *
+   * @return Kernel name
+   */
+  const char *name() const override { return "NEGatherKernelEx"; }
+  /** Initialise the kernel's inputs and outputs
+   *
+   * @param[in]  input   Source tensor. Supported tensor rank: up to 4. Data type supported:
+   * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in]  indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   * following type: U32/S32. Each value must be in range [0, input.shape[@p axis])
+   * @param[out] output  Destination tensor. Data type supported: Same as @p input
+   * @param[in]  axis    (Optional) The axis in @p input to gather @p indices from. Negative values
+   * wrap around. Defaults to 0
+   */
+  void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEGatherKernelEx
+   *
+   * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported:
+   * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+   * following type: U32/S32. Each value must be in range [0, input.shape[@p axis])
+   * @param[in] output  Destination tensor info. Data type supported: Same as @p input
+   * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Negative values
+   * wrap around. Defaults to 0
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+                         const ITensorInfo *output, int axis);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  /** Implementation of the gather operation for 0 axis.
+   *
+   * For gather on the 0 axis an element-by-element copy is performed.
+   *
+   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+   * returned by window())
+   * @param[in] info   Info about executing thread and CPU.
+   */
+  template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info);
+
+  /** Implementation of the gather operation.
+   *
+   * For 1<=axis a row-wise copy takes place.
+   *
+   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+   * returned by window())
+   * @param[in] info   Info about executing thread and CPU.
+   */
+  template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info);
+
+  using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info);
+
+  const ITensor *_input;
+  const ITensor *_indices;
+  int _axis;
+  ITensor *_output;
+  kernel_ptr _func;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
new file mode 100644
index 000000000..d8976e7d0
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
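
NEEmbeddingLookupKernel and the axis-0 path of NEGatherKernelEx above both reduce to copying whole input rows selected by an index tensor. A scalar sketch of that row copy, with strides flattened and bounds checks omitted (names are illustrative, not library API):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Each output row i is a verbatim copy of input row indices[i].
void gather_rows_ref(const float *input, const std::int32_t *indices, float *output,
                     int num_indices, int row_elems)
{
  for (int i = 0; i < num_indices; ++i)
    std::memcpy(output + static_cast<std::size_t>(i) * row_elems,
                input + static_cast<std::size_t>(indices[i]) * row_elems,
                row_elems * sizeof(float));
}
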
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HashtableLookup operation */
+class NEHashtableLookupKernel : public INEKernel
+{
+public:
+  const char *name() const override { return "NEHashtableLookupKernel"; }
+  /** Default constructor */
+  NEHashtableLookupKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default;
+  /** Initialize the kernel's inputs and outputs.
+   *
+   * @param[in]  lookups  Lookups 1D tensor whose values are indices into the first dimension of
+   * input. Data types supported: S32
+   * @param[in]  keys     Keys 1D tensor. The keys and input pair represent a map.
+   *                      Data types supported: S32
+   * @param[in]  input    Source tensor.
+   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[out] output   Destination tensor. Data types and data layouts supported: Same as @p
+   * input.
+   * @param[out] hits     Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+   * (True) or not (False). Data types supported: U8/QASYMM8
+   */
+  void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output,
+                 ITensor *hits);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEHashtableLookupKernel
+   *
+   * @param[in]  lookups  The lookups tensor info. Data types supported: S32.
+   * @param[in]  keys     The keys tensor info. The keys and input pair represent a map.
+   *                      Data types supported: S32
+   * @param[in]  input    The input tensor info.
+   *                      Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[out] output   The output tensor info. Data types and data layouts supported: Same as @p
+   * input.
+   * @param[out] hits     The hits tensor info.
A boolean tensor that indicates whether the lookup + * hits (True) or not (False). Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_lookups; /** Lookups tensor */ + const ITensor *_keys; /** Keys tensor */ + const ITensor *_input; /** Source tensor */ + ITensor *_output; /** Destination tensor */ + ITensor *_hits; /** Hits tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..76e2587af --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
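
Behaviourally, HashtableLookup resolves each lookup value against keys, copies the matching input row on a hit, and clears the corresponding hit flag on a miss. A scalar sketch under those assumptions (linear key search; flattened strides; the miss row is zero-filled, as the Android NN HASHTABLE_LOOKUP definition this mirrors specifies; names are illustrative):

#include <cstdint>
#include <cstring>

void hashtable_lookup_ref(const std::int32_t *lookups, int num_lookups,
                          const std::int32_t *keys, int num_keys, const float *input,
                          int row_elems, float *output, std::uint8_t *hits)
{
  for (int i = 0; i < num_lookups; ++i)
  {
    int row = -1;
    for (int k = 0; k < num_keys; ++k) // linear search stands in for the real lookup
      if (keys[k] == lookups[i])
      {
        row = k;
        break;
      }
    hits[i] = (row >= 0) ? 1 : 0;
    if (row >= 0)
      std::memcpy(output + i * row_elems, input + row * row_elems, row_elems * sizeof(float));
    else
      std::memset(output + i * row_elems, 0, row_elems * sizeof(float)); // miss: zero the row
  }
}
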
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
new file mode 100644
index 000000000..76e2587af
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for performing an instance normalization */
+class NEInstanceNormalizationLayerKernelEx : public INEKernel
+{
+public:
+  const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; }
+  /** Default constructor */
+  NEInstanceNormalizationLayerKernelEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEInstanceNormalizationLayerKernelEx &
+  operator=(const NEInstanceNormalizationLayerKernelEx &) = delete;
+  /** Allow instances of this class to be moved */
+  NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default;
+  /** Allow instances of this class to be moved */
+  NEInstanceNormalizationLayerKernelEx &
+  operator=(NEInstanceNormalizationLayerKernelEx &&) = default;
+  /** Default destructor */
+  ~NEInstanceNormalizationLayerKernelEx() = default;
+  /** Set the input and output tensors.
+   *
+   * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported:
+   *                         NCHW
+   *                         In case of @p output tensor = nullptr this tensor will store the
+   *                         result of the normalization.
+   * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p
+   *                         input.
+   * @param[in]      gamma   (Optional) Scale tensor applied to the normalized tensor. Defaults to
+   *                         nullptr, which is treated as a scale of 1.0.
+   * @param[in]      beta    (Optional) Offset tensor applied to the normalized tensor. Defaults
+   *                         to nullptr, which is treated as an offset of 0.0.
+   * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   */
+  void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr,
+                 ITensor *beta = nullptr, float epsilon = 1e-12f);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEInstanceNormalizationLayerKernelEx.
+   *
+   * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported:
+   *                    NCHW
+   * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p
+   *                    input.
+   * @param[in] gamma   (Optional) Scale tensor info applied to the normalized tensor. Defaults to
+   *                    nullptr.
+   * @param[in] beta    (Optional) Offset tensor info applied to the normalized tensor. Defaults
+   *                    to nullptr.
+   * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr,
+                         float epsilon = 1e-12f);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  /** Common signature for all the specialized instance normalization functions
+   *
+   * @param[in, out] input   An input tensor. In case of @p output tensor = nullptr this tensor
+   *                         will store the result of the normalization.
+   * @param[out]     output  The output tensor.
+   * @param[in]      gamma   The scale tensor applied to the normalized tensor. May be nullptr.
+   * @param[in]      beta    The offset tensor applied to the normalized tensor. May be nullptr.
+   * @param[in]      epsilon Lower bound value for the normalization.
+   */
+  using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma,
+                                     ITensor *beta, float epsilon, const Window &window);
+
+  NormalizationFunction *_func;
+  ITensor *_input;
+  ITensor *_output;
+  ITensor *_gamma;
+  ITensor *_beta;
+  float _epsilon;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
new file mode 100644
index 000000000..723b14523
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
+#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel that multiplies an S32 tensor by its scale factor. */
+class NEMultiplyScaleFactorKernel : public INEKernel
+{
+public:
+  const char *name() const override { return "NEMultiplyScaleFactorKernel"; }
+  /** Default constructor */
+  NEMultiplyScaleFactorKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete;
+  /** Default Move Constructor. */
+  NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default;
+  /** Default move assignment operator */
+  NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default;
+  /** Default destructor */
+  ~NEMultiplyScaleFactorKernel() = default;
+  /** Set input, output tensors.
+   *
+   * @param[in]  input        Source tensor. Data type supported: S32.
+   * @param[in]  scale_factor Scale tensor. Data type supported: F16/F32.
+   * @param[out] output       Destination tensor. Data type supported: Same as @p scale_factor.
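+   * @param[in]  multiplier   (Optional) Extra multiplier applied to the scale. Defaults to 1.f
+   *
+   * A minimal usage sketch (illustrative; it assumes already-allocated tensors, with an S32
+   * input and F16/F32 scale factor and output):
+   * @code
+   * NEMultiplyScaleFactorKernel mul;
+   * mul.configure(&acc_s32, &scale_f32, &out_f32, 1.f);
+   * @endcode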
+   */
+  void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output,
+                 float multiplier = 1.f);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEMultiplyScaleFactorKernel
+   *
+   * @param[in] input        Input tensor info. Data types supported: S32.
+   * @param[in] scale_factor Scale tensor info. Data type supported: F16/F32.
+   * @param[in] output       Output tensor info. Data types supported: Same as @p scale_factor.
+   * @param[in] multiplier   (Optional) Extra multiplier applied to the scale. Defaults to 1.f
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor,
+                         const ITensorInfo *output, float multiplier = 1.f);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  template <typename T> void multiply(const Window &window);
+
+private:
+  const ITensor *_input;
+  const ITensor *_scale_factor;
+  ITensor *_output;
+  float _multiplier;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
new file mode 100644
index 000000000..79bb78661
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__
+#define __ARM_COMPUTE_NEPRELUKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform Parametric Rectified Linear Unit
+ *
+ * Result is computed by:
+ * @f[ output(x) = \begin{cases} \alpha \cdot x & x < 0 \\ x & x \geq 0 \end{cases} @f]
+ */
+class NEPReLUKernel : public INEKernel
+{
+public:
+  const char *name() const override { return "NEPReLUKernel"; }
+  /** Default constructor */
+  NEPReLUKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEPReLUKernel(const NEPReLUKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEPReLUKernel &operator=(const NEPReLUKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  NEPReLUKernel(NEPReLUKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  NEPReLUKernel &operator=(NEPReLUKernel &&) = default;
+  /** Initialise the kernel's inputs and output
+   *
+   * @param[in]  input  Input tensor. Data type supported: QASYMM8/F32
+   * @param[in]  alpha  Alpha tensor. Data types supported: Same as @p input
+   * @param[out] output Output tensor. Data types supported: Same as @p input
+   */
+  void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEPReLUKernel
+   *
+   * @param[in] input  Input tensor info. Data types supported: QASYMM8/F32.
+   * @param[in] alpha  Alpha tensor info. Data types supported: Same as @p input.
+   * @param[in] output Output tensor info. Data types supported: Same as @p input.
+   *
+   * @return a Status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *alpha,
+                         const ITensorInfo *output);
+  static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
+                                   const ITensorInfo &output);
+
+private:
+  const ITensor *_input; /**< Source tensor */
+  const ITensor *_alpha; /**< Alpha tensor */
+  ITensor *_output;      /**< Destination tensor */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */
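For orientation, a configure sketch (illustrative only; it assumes matching, already-allocated tensors):

    arm_compute::NEPReLUKernel prelu;
    prelu.configure(&input, &alpha, &output); // output(x) = x >= 0 ? x : alpha * x

As with the other NEON kernels in this extension, execution is normally delegated to a wrapping IFunction.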
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
new file mode 100644
index 000000000..590b23873
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
+#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the symmetric quantization kernel. */
+class NEQuantizationSymmetricKernel : public INEKernel
+{
+public:
+  const char *name() const override { return "NEQuantizationSymmetricKernel"; }
+  /** Default constructor */
+  NEQuantizationSymmetricKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete;
+  /** Default Move Constructor. */
+  NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default;
+  /** Default move assignment operator */
+  NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default;
+  /** Default destructor */
+  ~NEQuantizationSymmetricKernel() = default;
+  /** Set input, output tensors.
+   *
+   * @param[in]  input        Source tensor. Data type supported: F16/F32.
+   * @param[out] output       Destination tensor with the same dimensions as @p input. Data type
+   *                          supported: S8.
+   * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input.
+   */
+  void configure(const ITensor *input, ITensor *output, ITensor *scale_factor);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEQuantizationSymmetricKernel
+   *
+   * @param[in] input        Input tensor info. Data types supported: F16/F32.
+   * @param[in] output       Output tensor info. Data types supported: S8.
+   * @param[in] scale_factor Scale tensor info of @p output. Data type supported: Same as @p
+   *                         input.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                         const ITensorInfo *scale_factor);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  template <typename T> void quantize(const Window &window);
+
+private:
+  const ITensor *_input;
+  ITensor *_output;
+  ITensor *_scale_factor;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
new file mode 100644
index 000000000..73991b67d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ +#define __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a reduction operation */ +class NEReductionOperationKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEReductionOperationKernelEx"; } + /** Default constructor */ + NEReductionOperationKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReductionOperationKernelEx(const NEReductionOperationKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReductionOperationKernelEx &operator=(const NEReductionOperationKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEReductionOperationKernelEx(NEReductionOperationKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEReductionOperationKernelEx &operator=(NEReductionOperationKernelEx &&) = default; + /** Default destructor */ + ~NEReductionOperationKernelEx() = default; + + /** Set the source, destination of the kernel + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: + * NCHW. + * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] op Reduction operation to perform. + */ + void configure(const ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReductionOperationKernelEx. + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts + * supported: NCHW. + * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p + * input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 + * @param[in] op Reduction operation to perform. 
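+ *
+ * A validation sketch (illustrative; the tensor infos below are stand-ins):
+ * @code
+ * TensorInfo in(TensorShape(16U, 4U), 1, DataType::F32);
+ * TensorInfo out(TensorShape(1U, 4U), 1, DataType::F32);
+ * Status s = NEReductionOperationKernelEx::validate(&in, &out, 0, ReduceOperation::MAX);
+ * @endcode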
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + const ITensor *_input; + ITensor *_output; + unsigned int _reduction_axis; + ReduceOperation _op; + BorderSize _border_size; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h new file mode 100644 index 000000000..5d697c2b2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ +#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the space to depth kernel */ +class NESpaceToDepthLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NESpaceToDepthLayerKernelEx"; } + /** Default constructor */ + NESpaceToDepthLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default; + /** Default destructor */ + ~NESpaceToDepthLayerKernelEx() = default; + /** Initialise the kernel's inputs and output. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. 
Data types supported: same as @p input + * @param[in] block_shape Block shape value + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToDepthLayerKernelEx + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape value + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; /**< Source tensor */ + ITensor *_output; /**< Destination tensor */ + int32_t _block_shape; /**< Block shape */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h new file mode 100644 index 000000000..3b0902f08 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_TYPESEX_H__ +#define __ARM_COMPUTE_TYPESEX_H__ + +namespace arm_compute +{ + +/** Available ArgIndex operations **/ +enum class ArgOperation +{ + MAX, + MIN, +}; + +/** Available reduce operations */ +enum class ReduceOperation +{ + MAX, /**< Max */ + MEAN, /**< Mean */ + SUM, /**< Sum */ + MIN, /**< Min */ +}; + +/** Available binary logical operations */ +enum class BinaryLogicalOperation +{ + AND, /**< AND */ + OR, /**< OR */ +}; + +enum class ComparisonOperationEx +{ + EQUAL, /**< EQUAL */ + NOT_EQUAL, /**< NOT_EQUAL */ +}; + +enum class ElementWiseUnaryEx +{ + NEG, /**< NEG */ +}; + +enum class SubDataType +{ + NONE, + BOOL, +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h new file mode 100644 index 000000000..39026e6bb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_UTILSEX_H__
+#define __ARM_COMPUTE_UTILSEX_H__
+
+#include <utility>
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+
+/** Returns the expected width and height of the transpose convolution's output tensor.
+ *
+ * @note This function was copied from ACL in order to fix a bug that produced wrong output
+ *       dimensions.
+ *
+ * @param[in] in_width      Width of input tensor (Number of columns)
+ * @param[in] in_height     Height of input tensor (Number of rows)
+ * @param[in] kernel_width  Kernel width.
+ * @param[in] kernel_height Kernel height.
+ * @param[in] info          Padding and stride info.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_top   The number of zeros added to the top edge of the output.
+ *
+ * @return A pair with the new width in the first position and the new height in the second.
+ */
+const std::pair<unsigned int, unsigned int>
+transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+                                unsigned int kernel_width, unsigned int kernel_height,
+                                const PadStrideInfo &info, unsigned int invalid_right,
+                                unsigned int invalid_top);
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_UTILSEX_H__ */
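A call sketch for the helper above (illustrative only; PadStrideInfo(2, 2, 0, 0) encodes stride 2 with zero padding):

    const auto dims = arm_compute::transposeconv_output_dimensions(
        4 /*in_width*/, 4 /*in_height*/, 3 /*kernel_width*/, 3 /*kernel_height*/,
        arm_compute::PadStrideInfo(2, 2, 0, 0), 0 /*invalid_right*/, 0 /*invalid_top*/);
    // With zero padding one expects the usual (in - 1) * stride + kernel extent, i.e. 9x9 here.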
diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
new file mode 100644
index 000000000..16fd40ed9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace shape_calculator
+{
+
+/** Calculate the upsampled output shape used for transpose convolution
+ *
+ * @param[in]  input          Input tensor info
+ * @param[in]  weights        Weights tensor info
+ * @param[in]  info           Padding and stride info
+ * @param[in]  out_dims       Output shape dimensions
+ * @param[in]  invalid_right  The number of zeros added to the right edge of the output.
+ * @param[in]  invalid_bottom The number of zeros added to the bottom edge of the output.
+ * @param[out] pad_left       Padding on left
+ * @param[out] pad_right      Padding on right
+ * @param[out] pad_top        Padding on top
+ * @param[out] pad_bottom     Padding on bottom
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_transposeconv_upsampled_shape(
+    const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info,
+    std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right,
+    unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right,
+    unsigned int &pad_top, unsigned int &pad_bottom)
+{
+  unsigned int sx = info.stride().first;
+  unsigned int sy = info.stride().second;
+  const DataLayout data_layout = input.data_layout();
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+  // Find the upsampled dimensions
+  // transpose conv out:
+  //    tconv_out + pad = 1 + (in - 1) * stride + invalid
+  //    tconv_out = 1 + (in - 1) * stride + invalid - pad
+  // upsample out:
+  //    upsample_out = 1 + (in - 1) * stride
+  unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1;
+  unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1;
+
+  // Find the padding needed for the convolution with stride 1 in order to match output shape
+  // upsample+pad out:
+  //    upsample_out + pad = tconv_out + kernel - 1
+  //    pad = tconv_out + kernel - 1 - upsample_out
+  unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
+  unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
+  out_x += padx;
+  out_y += pady;
+
+  // Split the horizontal padding between the left and right edges, and the vertical padding
+  // between the top and bottom edges, excluding the explicitly invalid region.
+  unsigned int padx_all_except_invalid = padx + info.pad_left() + info.pad_right() - invalid_right;
+  unsigned int pady_all_except_invalid =
+      pady + info.pad_top() + info.pad_bottom() - invalid_bottom;
+  pad_left = (padx_all_except_invalid + 1) / 2 - info.pad_left();
+  pad_right = padx_all_except_invalid / 2 - info.pad_right() + invalid_right;
+  pad_top = (pady_all_except_invalid + 1) / 2 - info.pad_top();
+  pad_bottom = pady_all_except_invalid / 2 - info.pad_bottom() + invalid_bottom;
+
+  TensorShape scale_out_shape(input.tensor_shape());
+  scale_out_shape.set(idx_w, out_x);
+  scale_out_shape.set(idx_h, out_y);
+
+  return scale_out_shape;
+}
+
+/** Calculate the output shape of the transpose convolution layer
+ *
+ * @param[in] out_dims Output x and y shape dimensions
+ * @param[in] input    Input tensor info
+ * @param[in] weights  Weights tensor info
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
+                                   const ITensorInfo &input, const ITensorInfo &weights)
+{
+  const TensorShape input_shape{input.tensor_shape()};
+  const TensorShape weights_shape{weights.tensor_shape()};
+
+  const DataLayout data_layout = input.data_layout();
+  const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  const int channel_idx =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+  const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+  TensorShape out_shape{input_shape};
+  out_shape.set(width_idx, out_dims.first);
+  out_shape.set(height_idx, out_dims.second);
+  out_shape.set(channel_idx, weights_shape[batch_idx]);
+  return out_shape;
+}
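As a sanity check of the padding algebra in compute_transposeconv_upsampled_shape above: for a 4x4 input with stride 2, a 3x3 kernel and a desired 8x8 transpose-convolution output, the upsampled extent is (4 - 1) * 2 + 1 = 7, the extra padding is padx = 8 - (7 - 3 + 1) = 3, the padded extent becomes 7 + 3 = 10, and the stride-1 convolution then yields 10 - 3 + 1 = 8 columns, matching the requested output.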
+/** Calculate the depth to space output shape of a tensor
+ *
+ * @param[in] input Input tensor info
+ * @param[in] block Block shape value
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block)
+{
+  ARM_COMPUTE_ERROR_ON(block < 2);
+
+  const DataLayout data_layout = input->data_layout();
+  const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  const int idx_channel =
+      get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+  TensorShape output_shape{input->tensor_shape()};
+  output_shape.set(idx_width, input->dimension(idx_width) * block);
+  output_shape.set(idx_height, input->dimension(idx_height) * block);
+  output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block));
+
+  return output_shape;
+}
+
+/** Calculate the space to depth output shape of a tensor
+ *
+ * @param[in] input       Input tensor info
+ * @param[in] block_shape Block shape value
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape)
+{
+  ARM_COMPUTE_ERROR_ON(block_shape < 2);
+  TensorShape output_shape{input->tensor_shape()};
+
+  const DataLayout data_layout = input->data_layout();
+  const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+  const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+  // Space-to-depth shrinks the spatial dimensions by the block size and grows the channel
+  // dimension by block_shape^2.
+  output_shape.set(idx_width, input->tensor_shape()[idx_width] / block_shape);
+  output_shape.set(idx_height, input->tensor_shape()[idx_height] / block_shape);
+  output_shape.set(idx_depth, input->tensor_shape()[idx_depth] * (block_shape * block_shape));
+
+  return output_shape;
+}
+
+/** Calculate the gather output shape of a tensor
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] actual_axis   The axis to be gathered
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
+                                           const TensorShape &indices_shape, uint32_t actual_axis)
+{
+  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
+  ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
+
+  TensorShape output_shape = input_shape;
+  if (indices_shape.num_dimensions() == 1)
+  {
+    output_shape[actual_axis] = indices_shape[0];
+  }
+  else if (indices_shape.num_dimensions() > 1)
+  {
+    output_shape.shift_right(indices_shape.num_dimensions() - 1);
+
+    for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
+    {
+      if (o == actual_axis)
+      {
+        ++i;
+        for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
+        {
+          output_shape[o] = indices_shape[in];
+        }
+      }
+      else
+      {
+        output_shape[o] = input_shape[i];
+      }
+    }
+  }
+  return output_shape;
+}
+
+} // namespace shape_calculator
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
new file mode 100644
index 000000000..831bb5423
---
/dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ +#define __ARM_COMPUTE_CLFUNCTIONSEX_H__ + +#include <arm_compute/runtime/CL/functions/CLArgOperation.h> +#include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h> +#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h> +#include <arm_compute/runtime/CL/functions/CLCast.h> +#include <arm_compute/runtime/CL/functions/CLDepthToSpace.h> +#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> +#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h> +#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLLogicalNot.h> +#include <arm_compute/runtime/CL/functions/CLNeg.h> +#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h> +#include <arm_compute/runtime/CL/functions/CLPReLU.h> +#include <arm_compute/runtime/CL/functions/CLReduceOperation.h> +#include <arm_compute/runtime/CL/functions/CLRNNLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLSpaceToBatchND.h> +#include <arm_compute/runtime/CL/functions/CLSpaceToDepth.h> +#include <arm_compute/runtime/CL/functions/CLSplit.h> +#include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h> +#include <arm_compute/runtime/CL/functions/CLTopKV2.h> +#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h> + +#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h new file mode 100644 index 000000000..d9d0d4d35 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/**
+ * @file CLArgOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLArgOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLARGOPERATION_H__
+#define __ARM_COMPUTE_CLARGOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute CLArgOperation operation
+ */
+class CLArgOperation : public IFunction
+{
+public:
+  /**
+   * @brief Construct a new CLArgOperation object
+   */
+  CLArgOperation();
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+  CLArgOperation(const CLArgOperation &) = delete;
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   */
+  CLArgOperation &operator=(const CLArgOperation &) = delete;
+
+  /**
+   * @brief Construct a new CLArgOperation object by using move constructor
+   * @param[in] CLArgOperation object to move
+   */
+  CLArgOperation(CLArgOperation &&) = default;
+
+  /**
+   * @brief Move-assign a CLArgOperation object.
+   * @param[in] CLArgOperation object to assign. This object will be moved.
+   */
+  CLArgOperation &operator=(CLArgOperation &&) = default;
+
+  /**
+   * @brief Initialise the kernel's inputs and outputs.
+   * @param[in]  input  Input tensor. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[out] output The result of arg operation. Data types supported: S32.
+   * @param[in]  axis   Axis along which to reduce. It must be sorted and contain no duplicates.
+   * @param[in]  op     Arg operation to perform.
+   * @return N/A
+   */
+  void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, ArgOperation op);
+
+  /**
+   * @brief Static function to check if given info will lead to a valid configuration
+   * @param[in] input  Input tensor info. Data types supported: U8/QASYMM8/S32/F32.
+   * @param[in] axis   Axis along which to reduce. It must be sorted and contain no duplicates.
+   * @param[in] output The result of arg operation. Data types supported: S32.
+   * @param[in] op     Arg operation to perform.
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+                         const ITensorInfo *output, ArgOperation op);
+  /**
+   * @brief Run the OpenCL kernel for this operation
+   * @return N/A
+   */
+  void run() override;
+
+private:
+  ICLTensor *_input{nullptr};
+  ICLTensor *_output{nullptr};
+  std::vector<uint32_t> _axis{};
+  ArgOperation _arg_op{ArgOperation::MAX};
+
+  std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+  std::unique_ptr<CLArgOperationKernel[]> _argop_kernels{nullptr};
+  size_t _num_of_kernels{0};
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
new file mode 100644
index 000000000..d16a0762d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ +#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLBatchToSpaceNDKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. + * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLBatchToSpaceND : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] block_size A pointer to an array of integer values specifying block sizes + * for spatial dimension. + */ + void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size); +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h new file mode 100644 index 000000000..061e34f26 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLBinaryLogicalOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8. + * @param[in] input2 Source tensor2. Data types supported: U8 QASYMM8. + * @param[out] output Output tensor. Data types supported: U8, QASYMM8. 
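+ * @param[in] op Binary logical operation to perform: BinaryLogicalOperation::AND or
+ * BinaryLogicalOperation::OR (declared in TypesEx.h).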
+ */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h new file mode 100644 index 000000000..36acfaed7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CLCast.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLCast class + */ + +#ifndef __ARM_COMPUTE_CLCAST_H__ +#define __ARM_COMPUTE_CLCAST_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLCastKernel. + * This converts the input tensor to the tensor of the output tensor's type. + */ +class CLCast : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's input and output + * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype); +}; +} +#endif /* __ARM_COMPUTE_CLCAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h new file mode 100644 index 000000000..d78a6ada4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__ +#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLDepthToSpaceKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. 
+ * @note The function converts the input tensor to the tensor of the output tensor's type.
+ */
+class CLDepthToSpace : public ICLSimpleFunction
+{
+public:
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input      Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output     Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  block_size Block size (integer only)
+   */
+  void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..257772a89
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+  /**
+   * @brief Set the input and output tensors.
+   * @param[in]  input   Source tensor.
+   *                     Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p
+   *                     input.
+   * @param[in]  lookups Lookups 1D tensor whose values are indices into the first dimension of
+   *                     input.
+   * @return N/A
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
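A usage sketch for the embedding lookup (illustrative only; it assumes an allocated 2D embedding table and S32 indices):

    arm_compute::CLEmbeddingLookup embed;
    embed.configure(&table, &output, &lookups); // rows of `table` selected by `lookups`
    embed.run();                                // ICLSimpleFunction enqueues the underlying kernel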
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
new file mode 100644
index 000000000..fd0a65f20
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file    CLFullyConnectedReshapingLayer.h
+ * @brief   This file contains CLFullyConnectedReshapingLayer class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
+#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
+
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h>
+#include <arm_compute/runtime/IMemoryManager.h>
+
+namespace arm_compute
+{
+/**
+ * @brief Class to run FullyConnected Layer after reshaping input tensor
+ */
+class CLFullyConnectedReshapingLayer : public arm_compute::IFunction
+{
+public:
+  CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+      : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{},
+        _cl_fc{memory_manager}, _cl_reshape{}, _needs_reshape(false)
+  {
+    // DO NOTHING
+  }
+
+public:
+  /**
+   * @brief Configure the layer
+   * @param[in]  input         The source tensor
+   * @param[in]  weights       The tensor that is filled with weight values
+   * @param[in]  biases        The tensor that is filled with bias values
+   * @param[out] output        The destination tensor
+   * @param[in]  needs_reshape Whether the input needs to be reshaped or not
+   * @param[in]  reshape       The target shape of the reshape. Only valid when needs_reshape is
+   *                           true.
+   * @return N/A
+   */
+  void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights,
+                 const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output,
+                 bool needs_reshape, const arm_compute::TensorShape &reshape);
+
+public:
+  /**
+   * @brief Run the operation. Must be called after configure().
+   * @return N/A
+   */
+  void run(void) override;
+  /**
+   * @brief Prepare the operation
+   * @return N/A
+   */
+  void prepare(void) override;
+
+private:
+  const arm_compute::ICLTensor *_input;
+  const arm_compute::ICLTensor *_weights;
+  const arm_compute::ICLTensor *_biases;
+  arm_compute::ICLTensor *_output;
+
+  // buffer for reshaping input tensor
+  arm_compute::CLTensor _cl_buffer;
+
+private:
+  arm_compute::CLFullyConnectedLayer _cl_fc;
+  // TODO Change to CLReshapeLayer
+  arm_compute::misc::GenericReshapeLayer _cl_reshape;
+  bool _needs_reshape;
+};
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__
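The reshape-then-fully-connect flow is easiest to see in use. A minimal sketch (illustrative only; the tensor objects and the 4D-to-2D target shape are stand-ins):

    arm_compute::CLFullyConnectedReshapingLayer fc;
    fc.configure(&input, &weights, &biases, &output,
                 /*needs_reshape=*/true,
                 arm_compute::TensorShape(w * h * c, n)); // flatten [w, h, c, n] to [w*h*c, n]
    fc.run(); // when needs_reshape is true, the input is first reshaped into the internal CL buffer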
+ */
+
+/**
+ * @file CLGatherEx.h
+ * @brief This file contains CLGatherEx class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_CLGATHEREX_H__
+#define __ARM_COMPUTE_CLGATHEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLGatherKernel.
+ */
+class CLGatherEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs, output and conversion policy.
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices An indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration
+ * of @ref CLGatherEx
+ * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
+ * @param[in] indices An indices tensor. Data types supported: S32.
+ * @param[out] output The output tensor. Data types supported: same as @p input.
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+                        const ITensorInfo *output, int axis = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
new file mode 100644
index 000000000..65aa6cbd5
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLHashtableLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLHashtableLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform HashtableLookup operation
+ */
+class CLHashtableLookup : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of input.
+ * @param[in] keys Keys 1D tensor. keys and input pair represent a map.
+ * Data types supported: S32
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
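+// Editorial sketch of the validate()/configure() pairing declared above for
+// CLGatherEx (tensors assumed set up as in the earlier sketches; axis value is
+// hypothetical):
+//
+//   Status s = CLGatherEx::validate(input.info(), indices.info(), output.info(), /*axis=*/1);
+//   if(s.error_code() == ErrorCode::OK)
+//   {
+//     CLGatherEx gather;
+//     gather.configure(&input, &indices, &output, /*axis=*/1);
+//     gather.run();
+//   }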
+ * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits
+ * (True) or not (False). Data types supported: U8/QASYMM8
+ * @return N/A
+ */
+ void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input,
+                ICLTensor *output, ICLTensor *hits);
+};
+}
+#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
new file mode 100644
index 000000000..ed29db925
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to perform an Instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref CLInstanceNormalizationLayerKernelEx
+ */
+class CLInstanceNormalizationLayerEx : public ICLSimpleFunction
+{
+public:
+ /** Default constructor */
+ CLInstanceNormalizationLayerEx();
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will
+ * store the result of the normalization.
+ * Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
+ * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to nullptr
+ * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to nullptr
+ * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+ */
+ void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr,
+                ICLTensor *beta = nullptr, float epsilon = 1e-12f);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLInstanceNormalizationLayerEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32.
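+// Editorial usage sketch for the CLHashtableLookup declared earlier (tensor
+// setup as in the previous sketches). Rows of `input` are keyed by `keys`, and
+// `hits` records whether each entry of `lookups` was found:
+//
+//   CLHashtableLookup htl;
+//   htl.configure(&lookups, &keys, &input, &output, &hits);
+//   htl.run();  // hits[i] != 0 iff lookups[i] matched a key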
Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h new file mode 100644 index 000000000..4bf203c5a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__ +#define __ARM_COMPUTE_CLLOGICALNOT_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLLogicalNot : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: QASYMM8. + * @param[out] output Output tensor. Data types supported: QASYMM8. + */ + void configure(ICLTensor *input, ICLTensor *output); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h new file mode 100644 index 000000000..198a0fd4e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
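+// Editorial usage sketch for CLInstanceNormalizationLayerEx above: gamma and
+// beta are optional, so a plain normalization only needs input and output
+// (tensors assumed set up as in the earlier sketches):
+//
+//   CLInstanceNormalizationLayerEx inorm;
+//   inorm.configure(&input, &output);  // gamma = beta = nullptr, epsilon = 1e-12f
+//   inorm.run();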
+ */
+#ifndef __ARM_COMPUTE_CLNEG_H__
+#define __ARM_COMPUTE_CLNEG_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLNeg : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLNEG_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
new file mode 100644
index 000000000..622a61b5e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLPRELU_H__
+#define __ARM_COMPUTE_CLPRELU_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+class CLPReLU : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[in] alpha Alpha (slope) tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ */
+ void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLPRELU_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
new file mode 100644
index 000000000..b142d3a2e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
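+// Editorial usage sketch for CLPReLU above: alpha holds the negative-slope
+// values (tensor setup as in the earlier sketches; per the usual PReLU
+// definition, alpha is applied element-wise to negative inputs):
+//
+//   CLPReLU prelu;
+//   prelu.configure(&input, &alpha, &output);
+//   prelu.run();  // output = input >= 0 ? input : alpha * input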
+ */
+
+/**
+ * @file CLPixelWiseDivision.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLPixelWiseDivision class
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLPixelWiseDivisionKernel.
+ */
+class CLPixelWiseDivision : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs, output and conversion policy.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be
+ * modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output The output tensor. Data types supported: same as @p input1.
+ * Note: U8 requires both inputs to be U8.
+ * @param[in] scale Scale to apply after the division.
+ * Scale must be positive and its value must be either 1/255 or
+ * 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ * @return N/A
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
+                ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+                RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLPixelWiseDivision
+ * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
+ * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
+ * @param[in] output The output tensor info. Data types supported: same as @p input1.
+ * Note: U8 requires both inputs to be U8.
+ * @param[in] scale Scale to apply after the division.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n
+ * where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+                        const ITensorInfo *output, float scale = 1.f,
+                        ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
+                        RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
+};
+}
+#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
new file mode 100644
index 000000000..7e88cb369
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018 ARM Limited.
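+// Editorial usage sketch for CLPixelWiseDivision above, spelling out the
+// default scale and policies (tensor setup as in the earlier sketches):
+//
+//   CLPixelWiseDivision div;
+//   div.configure(&num, &den, &quot, 1.f, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+//   div.run();  // quot = (num / den) * scale, element-wise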
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__ +#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__ + +#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLCopyKernel.h" +#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLRNNLayerEx */ +class CLRNNLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that + * multiplies the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies + * the current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same + * as @p input + * @param[out] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + */ + void configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, + ICLTensor *output, ActivationLayerInfo &info); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies + * the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the + * current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p + * input + * @param[in] output Output tensor of shape [num_units, batch_size]. 
Data types supported: Same as @p input
+ * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
+ * supported: Same as @p input
+ * @param[in] info Activation layer parameter.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+                        const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
+                        const ITensorInfo *hidden_state, const ITensorInfo *output,
+                        const ActivationLayerInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ CLGEMM _gemm_state_f;
+ CLSaturatedArithmeticOperationKernel _add_kernel;
+ CLActivationLayerKernel _activation_kernel;
+ CLFullyConnectedLayer _fully_connected_kernel;
+ CLCopyKernel _copy_kernel;
+ CLTensor _fully_connected_out;
+ CLTensor _gemm_output;
+ CLTensor _add_output;
+ bool _is_prepared;
+};
+}
+#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
new file mode 100644
index 000000000..1d367d56b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLReduceOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLReduceOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/TypesEx.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform ReduceOperation
+ */
+class CLReduceOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new ReduceOperation object
+ */
+ CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager);
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+ * @param[in] axis Axes along which to reduce. The set must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
+                bool keep_dims, ReduceOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperation.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+ * @param[in] axis Axes along which to reduce. The set must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                        const std::set<uint32_t> &axis, bool keep_dims, const ReduceOperation &op);
+
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::set<uint32_t> _axis;
+ bool _keep_dims;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
+ CLReshapeLayer _reshape;
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
new file mode 100644
index 000000000..7e2df8986
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToBatchNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32.
+ * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape
+ * block_shape, and interleaves these blocks with the "batch" dimension such that, in the
+ * output, the spatial dimensions are divided by the block shape and the batch dimension is
+ * multiplied by the product of the block shape.
+ */
+class CLSpaceToBatchND : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @note The data layout of input and output must be the same.
+ * @note The number of dimensions of input and output must be 4, and `spatial` dimensions
+ * are height and width.
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+ * Data layout supported: NCHW/NHWC
+ * @param[in] block_size Tensor of integer values specifying block sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[in] padding_size Tensor of integer values specifying padding sizes for spatial
+ * dimension.
+ * Data types supported: S32
+ * @param[out] output Output tensor. Data types supported: same as @p input.
+ * Data layout supported: NCHW/NHWC
+ */
+ void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+                ICLTensor *output);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
new file mode 100644
index 000000000..17f762092
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToDepthKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function converts the input tensor to the tensor of the output tensor's type.
+ */
+class CLSpaceToDepth : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+ * @param[in] block_size Block size; only integer values are supported.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
new file mode 100644
index 000000000..6b26a85c8
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
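+// Editorial usage sketch for CLSpaceToDepth above (tensor setup as in the
+// earlier sketches): with block_size 2, an NCHW tensor [N, C, H, W] becomes
+// [N, C*4, H/2, W/2]; H and W are assumed divisible by the block size:
+//
+//   CLSpaceToDepth s2d;
+//   s2d.configure(&input, &output, /*block_size=*/2);
+//   s2d.run();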
+ */
+
+/**
+ * @file CLStridedSliceEx.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLStridedSliceEx class
+ */
+
+#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLStridedSliceKernel
+ */
+class CLStridedSliceEx : public ICLSimpleFunction
+{
+public:
+ /**
+ * @brief Initialise the kernel's inputs and outputs
+ * @param[in] input Tensor input. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] beginData 'begin' vector of strided slice operation
+ * @param[in] endData 'end' vector of strided slice operation
+ * @param[in] stridesData 'strides' vector of strided slice operation
+ * @param[in] beginMask If the ith bit is set, begin[i] is ignored
+ * @param[in] endMask If the ith bit is set, end[i] is ignored
+ * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
+ * dimensionality by 1, taking on the value at index begin[i]
+ * @return N/A
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
+                ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
+                int32_t shrinkAxisMask);
+};
+}
+#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
new file mode 100644
index 000000000..20c749e0b
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file CLTopKV2.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLTopKV2 class
+ */
+#ifndef __ARM_COMPUTE_CLTOPK_V2_H__
+#define __ARM_COMPUTE_CLTOPK_V2_H__
+
+#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h"
+
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to execute TopKV2 operation.
+ */
+class CLTopKV2 : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLTopKV2 object
+ */
+ CLTopKV2();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2 &operator=(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Construct a new CLTopKV2 object by using move constructor
+ * @param[in] CLTopKV2 object to move
+ */
+ CLTopKV2(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Move-assign a CLTopKV2 object.
+ * @param[in] CLTopKV2 object to assign. This object will be moved.
+ */
+ CLTopKV2 &operator=(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input image. Data types supported: U8/S16/F32.
+ * @param[in] k The value of `k`.
+ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
+ * input type is F32.
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
+ * is U8/S16, F32 if input type is F32.
+ * @param[in] total_bits (Optional) Total number of bits of the radix-sort key. Defaults to 32.
+ * @param[in] bits (Optional) Number of bits processed per radix-sort pass. Defaults to 4.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+                int total_bits = 32, int bits = 4);
+
+ /**
+ * @brief Run the kernels contained in the function.
+ * Depending on the value of the following environment variables it works differently:
+ * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE",
+ * quick sort on GPU is used.
+ * - If the value of environment variable "ACL_TOPKV2" == "GPU",
+ * radix sort on GPU is used.
+ * - For any other value, TopKV2 runs on the CPU.
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ void run_on_cpu();
+ void run_on_gpu();
+ void run_on_gpu_single_quicksort();
+
+ uint32_t _k;
+ uint32_t _total_bits;
+ uint32_t _bits;
+ uint32_t _radix;
+ uint32_t _hist_buf_size;
+ uint32_t _glob_sum_buf_size;
+ uint32_t _n;
+
+ ICLTensor *_input;
+ ICLTensor *_values;
+ ICLTensor *_indices;
+
+ cl::Buffer _qs_idx_buf;
+ cl::Buffer _qs_temp_buf;
+ cl::Buffer _hist_buf;
+ cl::Buffer _glob_sum_buf;
+ cl::Buffer _temp_buf;
+ cl::Buffer _first_negative_idx_buf;
+ cl::Buffer _in_key_buf;
+ cl::Buffer _out_key_buf;
+ cl::Buffer _in_ind_buf;
+ cl::Buffer _out_ind_buf;
+
+ cl::Buffer *_p_in_key_buf;
+ cl::Buffer *_p_out_key_buf;
+ cl::Buffer *_p_in_ind_buf;
+ cl::Buffer *_p_out_ind_buf;
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+// Invalid result on GPU
+#if 0
+ CLTopKV2Single _qs_kernel;
+ CLTopKV2Init _init_kernel;
+ CLRadixSortHistogram _hist_kernel;
+ CLRadixSortScanHistogram _scan_hist_kernel;
+ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
+ CLRadixSortPasteHistogram _paste_hist_kernel;
+ CLRadixSortReorder _reorder_kernel;
+ CLTopKV2FindFirstNegative _find_first_negative_kernel;
+ CLTopKV2ReorderNegatives _reorder_negatives_kernel;
+ CLTopKV2Store _store_kernel;
+#endif
+};
+}
+#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
new file mode 100644
index 000000000..340a7bfe9
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
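+// Editorial usage sketch for CLTopKV2 above: the backend is selected at run()
+// time from the ACL_TOPKV2 environment variable documented in the class, so it
+// can be chosen in the launching shell (k and radix parameters below are
+// hypothetical):
+//
+//   $ ACL_TOPKV2=GPU ./my_app   # radix sort on GPU; any other value -> CPU
+//
+//   CLTopKV2 topk;
+//   topk.configure(&input, /*k=*/5, &values, &indices);  // default radix params
+//   topk.run();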
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
+
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Function to run the transpose convolution layer.
+ *
+ * @note This layer was copied in order to fix a bug that caused wrong output dimensions
+ * to be computed.
+ *
+ * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user
+ * specified value where a < stride - 1, that increases the padding top and right of the input
+ * image.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where:
+ * width_input is the size of the first input dimension.
+ * height_input is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimension.
+ *
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref
+ * CPPFlipWeightsKernel.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLTransposeConvLayerUpsample
+ * -# @ref CLConvolutionLayer
+ *
+ */
+class CLTransposeConvLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayer(const CLTransposeConvLayer &) = delete;
+ /** Default move constructor */
+ CLTransposeConvLayer(CLTransposeConvLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete;
+ /** Default move assignment operator */
+ CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default;
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+ * Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions
+ * as the @p input.
+ * @param[in] info Contains padding and policies to be used in the
+ * transpose convolution, this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been
+ * reshaped with @ref CLWeightsReshapeKernel.
+ */
+ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+                const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+                const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
+ * and an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+ * Same as @p input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions
+ * as the @p input.
+ * @param[in] info Contains padding and policies to be used in the
+ * transpose convolution, this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+                        const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                        unsigned int invalid_right, unsigned int invalid_bottom,
+                        const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ CLTransposeConvLayerUpsample _scale_f;
+ CLConvolutionLayer _conv_f;
+ CPPFlipWeightsKernel _flip_weights;
+ CLTensor _scaled_output;
+ ICLTensor *_original_weights;
+ CLTensor _weights_flipped;
+ bool _is_prepared;
+};
+}
+#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
new file mode 100644
index 000000000..4ae0e1830
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
+
+#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */
+class CLTransposeConvLayerUpsample : public IFunction
+{
+public:
+ /** Default constructor */
+ CLTransposeConvLayerUpsample();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default;
+ /** Default destructor */
+ virtual ~CLTransposeConvLayerUpsample() = default;
+
+ /** Initialize the function's source, destination, inner border and padding information.
+ *
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * @param[in] inner_border The number of zeros added to right and top edges of the input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+                const PadStrideInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLTransposeConvLayerUpsample
+ *
+ * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: same as @p input.
+ * @param[in] inner_border The number of zeros added to right and top edges of the input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, const PadStrideInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + CLTransposeConvLayerUpsampleKernel _upsample; + ICLTensor *_output; +}; +} +#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h new file mode 100644 index 000000000..8e7e2f937 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ +#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ + +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref CPPUpsample */ +class CPPUpsampleEx : public ICPPSimpleFunction +{ +public: + /** Configure the upsample CPP kernel + * + * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8 + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] info Padding information + */ + void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info); +}; +} +#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h new file mode 100644 index 000000000..37bccc52c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
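+// Editorial usage sketch for CPPUpsampleEx above: as a CPP function it takes
+// plain CPU tensors, and the PadStrideInfo mirrors the stride/padding of the
+// transpose convolution it feeds (values below are hypothetical):
+//
+//   #include "arm_compute/runtime/Tensor.h"
+//
+//   arm_compute::Tensor in, out;  // init/allocate as for any CPU tensor
+//   arm_compute::CPPUpsampleEx upsample;
+//   upsample.configure(&in, &out, arm_compute::PadStrideInfo(2, 2, 0, 0));  // stride 2, no pad
+//   upsample.run();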
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ +#define __ARM_COMPUTE_NEFUNCTIONSEX_H__ + +#include <arm_compute/runtime/NEON/functions/NEArgMinMax.h> +#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> +#include <arm_compute/runtime/NEON/functions/NECast.h> +#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/NEON/functions/NEGatherEx.h> +#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h> +#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEPReLU.h> +#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h> +#include <arm_compute/runtime/NEON/functions/NEReduceSum.h> +#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h> +#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h> + +#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h new file mode 100644 index 000000000..604cd93c4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEArgMinMax.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ +#define __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce min/max operation */ +template <ReductionOperation op> class NEArgMinMaxStatic : public IFunction +{ +public: + /** Constructor */ + NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] axis Reduction axis. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, int axis, ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEArgMinMax + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] axis Reduction axis. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEArgMinMaxLayer _reduction_kernel; + Tensor _reduced_out; + NEReshapeLayer _reshape; +}; + +/** Basic function to run arg max. */ +using NEArgMax = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>; +/** Basic function to run arg min. */ +using NEArgMin = NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_ARG_MIN_MAX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h new file mode 100644 index 000000000..2a624656d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
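+// Editorial usage sketch for the NEArgMax alias above (tensor setup as in the
+// CPPUpsampleEx sketch; shapes are hypothetical): the output holds the winning
+// indices along the reduced axis:
+//
+//   arm_compute::Tensor scores, best;  // e.g. scores: [num_classes, batch], best: [batch]
+//   arm_compute::NEArgMax argmax;
+//   argmax.configure(&scores, /*axis=*/0, &best);
+//   argmax.run();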
+ */ +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +class NEBinaryLogicalOperation : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs and output. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op); +}; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs and output. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8 + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8 + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output); +}; + +/** Basic function to run logical AND. */ +using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +/** Basic function to run logical OR.
*/ +using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h new file mode 100644 index 000000000..ae2f57f19 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECAST_H__ +#define __ARM_COMPUTE_NECAST_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */ +class NECast : public INESimpleFunctionNoBorder +{ +public: + /** Configure the kernel. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + */ + void configure(const ITensor *input, ITensor *output, + SubDataType input_subtype = SubDataType::NONE); + /** Static function to check if given info will lead to a valid configuration of @ref NECast + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32. + * @param[in] input_subtype Sub data type of input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype = SubDataType::NONE); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NECAST_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h new file mode 100644 index 000000000..90c0751b8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ +#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */ +class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value. + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEDepthToSpaceLayerEx. + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape x value. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h new file mode 100644 index 000000000..f0c8ecdb5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ +#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to compute the negative of an input tensor. */ +class NENegLayer : public INESimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input Input tensor. Data types supported: F16/F32/S32. + * @param[out] output Output tensor. Data types supported: same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NENegLayer + * + * @param[in] input Input tensor info. Data types supported: F16/F32/S32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h new file mode 100644 index 000000000..0646f1668 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
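+ * + * A minimal usage sketch for the NEEmbeddingLookup function declared below; the table, indices and output tensors are assumed to be created and allocated elsewhere, and indices must be a 1D S32 tensor of row indices into the table: + * @code + * arm_compute::NEEmbeddingLookup lookup; + * lookup.configure(&table, &output, &indices); + * lookup.run(); + * @endcode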
+ */ + +/** + * @file NEEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class NEEmbeddingLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of + * input. Data types supported: S32. + * @return N/A + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEEmbeddingLookup + * + * @param[in] input Source tensor info. Data types supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * @param[in] lookups Lookups tensor info. Data types supported: S32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h new file mode 100644 index 000000000..42a786821 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
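+ * + * A hedged usage sketch for the NEFullyConnectedHybridLayer declared below, following the validate-then-configure pattern used across this library; tensor setup is omitted and the names are illustrative (F32 input/output with S8 weights, per the configure() documentation): + * @code + * arm_compute::NEFullyConnectedHybridLayer fc; + * auto status = arm_compute::NEFullyConnectedHybridLayer::validate( + * input.info(), weights.info(), bias.info(), output.info()); + * if(status.error_code() == arm_compute::ErrorCode::OK) + * { + * fc.configure(&input, &weights, &bias, &output); + * fc.run(); + * } + * @endcode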
+ */ +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls + * the following kernels: + * + * -# @ref NETransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayerReshapeWeights + * + * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * and transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayer : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete; + /** Default move constructor */ + NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete; + /** Default move assignment operator */ + NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. 
+ * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayer + * + * @param[in] input Source tensor info. Data type supported: F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NEQuantizationSymmetricKernel _quant_input_kernel; + NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp; + NEMultiplyScaleFactorKernel _multiply_scale_kernel; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _reshape_weights_output; + Tensor _quantized_input; + Tensor _scale_factor; + Tensor _gemmlowp_output; + const ITensor *_original_weights; + bool _are_weights_reshaped; + bool _accumulate_biases; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h new file mode 100644 index 000000000..6bd67f322 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
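+ * + * A usage sketch for the NEFullyConnectedLayerEx declared below; unlike the stock NEFullyConnectedLayer it tolerates weights supplied as plain inputs, at some performance cost. Tensor setup is omitted and the names are illustrative; prepare() is shown explicitly for the one-off weight reshape/conversion: + * @code + * arm_compute::NEFullyConnectedLayerEx fc; + * fc.configure(&input, &weights, &bias, &output); + * fc.prepare(); // one-off weight reshape/conversion + * fc.run(); // may be invoked repeatedly afterwards + * @endcode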
+ */ +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and + * transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + * @note The difference from NEFullyConnectedLayer is that this class supports weights as input + * with performance loss. + */ +class NEFullyConnectedLayerEx : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; + /** Default move constructor */ + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; + /** Default move assignment operator */ + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedLayerEx + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFlattenLayerKernel _flatten_kernel; + NEConvertFullyConnectedWeights _convert_weights; + NEFullyConnectedLayerReshapeWeights _reshape_weights_function; + NEGEMM _mm_gemm; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _flatten_output; + Tensor _gemmlowp_output; + Tensor _converted_weights_output; + Tensor _reshape_weights_output; + const ITensor *_original_weights; + bool _are_weights_converted; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _accumulate_biases; + bool _is_quantized; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..18cb61bf9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       NEFullyConnectedReshapingLayer.h + * @brief      This file contains NEFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> +#include <arm_compute/runtime/Tensor.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class NEFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + enum class KernelType + { + GENERAL, //< General FC + PREPROCESSED_WEIGHTS //< Weights are constants so it can be preprocessed + }; + +public: + NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @param[in] kernel_type The kernel type for actual FullyConnected layer + * @return N/A + */ + void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights, + const arm_compute::ITensor *biases, arm_compute::ITensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape, + KernelType kernel_type); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + const arm_compute::ITensor *_input; + const arm_compute::ITensor *_weights; + const arm_compute::ITensor *_biases; + arm_compute::ITensor *_output; + + // buffer for reshaping input tensor + arm_compute::Tensor _neon_buffer; + +private: + std::unique_ptr<arm_compute::IFunction> _neon_fc; + NEReshapeLayer _neon_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h new file mode 100644 index 000000000..414b9f7d9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ +#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following + * NEON kernels if the DOT product instruction is not available: + * + * -# @ref NEGEMMInterleave4x4Kernel + * -# @ref NEGEMMTranspose1xWKernel + * -# @ref NEGEMMLowpMatrixMultiplyKernel + * -# @ref NEGEMMLowpOffsetContributionKernel + * -# @ref NEActivationLayer + * + * otherwise if the DOT product instruction is available: + * + * -# @ref NEGEMMLowpOffsetContributionKernel + * +*/ +class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction +{ +public: + /** Constructor */ + NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; + /** Default move constructor */ + NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete; + /** Default move assignment operator */ + NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default; + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. 
+ * -# Compute the matrix product of the resulting a * b in int32. + * + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is + * QASYMM8/QASYMM8_SIGNED otherwise + * + * @param[in] a First input tensor (Matrix A). Data type supported: + * QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: + * S32 + * @param[out] output Output tensor. Data type supported: Data type supported: + * S32/QASYMM8/QASYMM8_SIGNED + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped + * and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, + const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGEMMLowpMatrixMultiplyCoreEx + * + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is + * QASYMM8/QASYMM8_SIGNED otherwise + * + * @param[in] a First input tensor info (Matrix A). Data type supported: + * QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type + * supported: S32 + * @param[in] output Output tensor info. Data type supported: Data type supported: + * S32/QASYMM8/QASYMM8_SIGNED + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped + * and + * if the reshape of matrix B should be executed only for the first run + * + * @return a status + */ + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, + const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEGEMMAssemblyDispatch _asm_glue; + std::unique_ptr<INEKernel> _mm_kernel; + std::unique_ptr<INEKernel> _mtx_a_reshape_kernel; + std::unique_ptr<INEKernel> _mtx_b_reshape_kernel; + NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel; + // NEActivationLayer _activation_func; + + Tensor _vector_sum_col; + Tensor _vector_sum_row; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _mm_result_s32; + Tensor _signed_a; + Tensor _signed_output; + const ITensor *_original_b; + int32_t _a_offset; + int32_t _b_offset; + + bool _run_vector_matrix_multiplication; + bool _assembly_path; + bool _fused_assembly_path; + bool _reshape_b_only_on_first_run; + bool _is_prepared; + bool _fuse_output_stage; + bool _run_activation; + bool _flip_signedness; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h new file mode 100644 index 000000000..d95e6a81e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEGATHEREX_H__ +#define __ARM_COMPUTE_NEGATHEREX_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEGatherKernelEx */ +class NEGatherEx : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h new file mode 100644 index 000000000..69abf0192 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file NEHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class NEHashtableLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. The keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEHashtableLookup + * + * @param[in] lookups Lookups 1D tensor info. + * Data types supported: S32 + * @param[in] keys Keys 1D tensor info. The keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] hits Hits 1D tensor info. A boolean tensor that indicates whether the lookup + * hits (True) or not (False). Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..521f50d2f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform a Instance normalization. + * + * This function runs the following kernels: + * -# @ref NEInstanceNormalizationLayerKernelEx + */ +class NEInstanceNormalizationLayerEx : public IFunction +{ +public: + /** Constructor */ + NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. 
Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEInstanceNormalizationLayerKernelEx _normalization_kernel; + bool _is_nchw; + NEPermute _permute_input; + NEPermute _permute_output; + Tensor _permuted_input; + Tensor _permuted_output; +}; +} +#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h new file mode 100644 index 000000000..5664c57cb --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPRELU_H__ +#define __ARM_COMPUTE_NEPRELU_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEPReLUKernel */ +class NEPReLU : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input. Data types supported: QASYMM8/F32. + * @param[in] alpha. Data types supported: Same as @p input. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(const ITensor *input, const ITensor *alpha, ITensor *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEPRELU_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h new file mode 100644 index 000000000..17c37d806 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__ +#define __ARM_COMPUTE_NERNNLAYER_EX_H__ + +#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "arm_compute/core/NEON/kernels/NECopyKernel.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to run @ref NERNNLayerEx */ +class NERNNLayerEx : public IFunction +{ +public: + /** Default constructor */ + NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERNNLayerEx(const NERNNLayerEx &) = delete; + /** Default move constructor */ + NERNNLayerEx(NERNNLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERNNLayerEx &operator=(const NERNNLayerEx &) = delete; + /** Default move assignment operator */ + NERNNLayerEx &operator=(NERNNLayerEx &&) = default; + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that + * multiplies the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies + * the current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same + * as @p input + * @param[out] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, + const ITensor *bias, ITensor *hidden_state, ITensor *output, + ActivationLayerInfo &info); + /** Initialize the function + * + * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. 
Data + * types supported: F16/F32 + * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies + * the input. Data types supported: Same as @p input + * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the + * current 'state'. Data types supported: Same as @p input + * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p + * input + * @param[in] output Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types + * supported: Same as @p input + * @param[in] info Activation layer parameter. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEGEMM _gemm_state_f; + NEArithmeticAdditionKernel _add_kernel; + NEActivationLayerKernel _activation_kernel; + NEFullyConnectedLayer _fully_connected_kernel; + NECopyKernel _copy_kernel; + Tensor _fully_connected_out; + Tensor _gemm_output; + Tensor _add_output; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h new file mode 100644 index 000000000..7209acf19 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
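
To make the documented call order of NERNNLayerEx concrete, a hedged sketch follows; the dimensions (input_size 32, num_units 64, batch 1) and the TANH activation are assumptions.

// Hedged NERNNLayerEx sketch; all dimensions are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

void rnn_example()
{
  using namespace arm_compute;
  Tensor input, weights, recurrent_weights, bias, hidden_state, output;
  input.allocator()->init(TensorInfo(TensorShape(32U, 1U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::F32));
  recurrent_weights.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
  hidden_state.allocator()->init(TensorInfo(TensorShape(64U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(64U, 1U), 1, DataType::F32));

  ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
  // Check the configuration first, mirroring the validate() contract above.
  if (bool(NERNNLayerEx::validate(input.info(), weights.info(), recurrent_weights.info(),
                                  bias.info(), hidden_state.info(), output.info(), act)))
  {
    NERNNLayerEx rnn;
    rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);
    // ... allocate all tensors, then call rnn.run() once per timestep ...
  }
}
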
+ */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ +#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceMeanEx : public IFunction +{ +public: + /** Constructor */ + NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceMeanEx + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr}; + std::unique_ptr<Tensor[]> _reduced_outs{nullptr}; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h new file mode 100644 index 000000000..9c558e6a2 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ +#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceOperation : public IFunction +{ +public: + /** Constructor */ + NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output, + ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceOperation + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReduceOperation op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperationEx> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h new file mode 100644 index 000000000..c028ea658 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
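
A hedged sketch of the NEReduceOperation API above; ReduceOperation is declared in TypesEx.h, which is not shown here, so the MAX enumerator used below is an assumption, as are the shapes.

// Hedged NEReduceOperation sketch; ReduceOperation::MAX is assumed to exist in TypesEx.h.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
#include "arm_compute/runtime/Tensor.h"

void reduce_max_example()
{
  using namespace arm_compute;
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
  // keep_dims = false: the reduced axis 0 is dropped from the output shape.
  output.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32));

  NEReduceOperation reduce;
  reduce.configure(&input, Coordinates(0), false, &output, ReduceOperation::MAX);

  input.allocator()->allocate();
  output.allocator()->allocate();
  reduce.run();
}
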
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__ +#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceSum : public IFunction +{ +public: + /** Constructor */ + NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperation> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h new file mode 100644 index 000000000..7180742df --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. 
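
The NEReduceSum function documented above follows the same pattern; a hedged sketch with assumed shapes and axes, this time taking the validate-then-configure route:

// Hedged NEReduceSum sketch; shapes and axes are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
#include "arm_compute/runtime/Tensor.h"

void reduce_sum_example()
{
  using namespace arm_compute;
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
  // keep_dims = true: reduced axes 0 and 1 stay in the output with length 1.
  output.allocator()->init(TensorInfo(TensorShape(1U, 1U, 2U), 1, DataType::F32));

  if (bool(NEReduceSum::validate(input.info(), Coordinates(0, 1), true, output.info())))
  {
    NEReduceSum reduce;
    reduce.configure(&input, Coordinates(0, 1), true, &output);
    // ... allocate tensors, then reduce.run() ...
  }
}
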
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ +#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to simulate a reduction operation. This function calls the following NEON + * kernels: + * + * -# @ref NEFillBorderKernel + * -# @ref NEReductionOperationKernelEx + * + */ +class NEReductionOperationEx : public IFunction +{ +public: + /** Default constructor */ + NEReductionOperationEx(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. + * @param[in] axis Dimension along which to reduce. + * @param[in] op Reduction operation to perform. + */ + void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReductionOperationEx. + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] axis Dimension along which to reduce. + * @param[in] op Reduction operation to perform. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, + ReduceOperation op); + + // Inherited methods overridden: + void run() override; + +private: + NEReductionOperationKernelEx _reduction_kernel; + NEFillBorderKernel _fill_border_kernel; + size_t _window_split; + int _reduction_axis; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h new file mode 100644 index 000000000..302f9af2e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
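
Unlike the Coordinates-based reducers above, NEReductionOperationEx takes a single axis; a hedged sketch follows, where ReduceOperation::MIN is assumed from TypesEx.h and the output shape (reduced axis kept with length 1) is also an assumption.

// Hedged NEReductionOperationEx sketch; enum value and output shape are assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
#include "arm_compute/runtime/Tensor.h"

void reduction_ex_example()
{
  using namespace arm_compute;
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));

  NEReductionOperationEx reduction;
  reduction.configure(&input, &output, 0 /* axis */, ReduceOperation::MIN);

  input.allocator()->allocate();
  output.allocator()->allocate();
  reduction.run(); // NEFillBorderKernel, then NEReductionOperationKernelEx
}
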
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ +#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h" +#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to spatial divide a tensor. This function calls the following NEON + * kernels/functions: + * + * -# @ref NEMemsetKernel + * -# @ref NESpaceToBatchLayerKernel + */ +class NESpaceToBatchLayerEx : public IFunction +{ +public: + /** Default constructor */ + NESpaceToBatchLayerEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete; + /** Allow instances of this class to be moved */ + NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default; + /** Allow instances of this class to be moved */ + NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default; + /** Default destructor */ + virtual ~NESpaceToBatchLayerEx() = default; + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 + * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32 + * @param[out] output Tensor output. Data types supported: same as @p input + */ + void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, + ITensor *output); + /** Set the input and output tensors. (Static block shape and paddings) + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[in] padding_left The left padding of the output tensor. + * @param[in] padding_right The right padding of the output tensor. + * @param[out] output Tensor output. 
Data types supported: same as @p input + */ + void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, + const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToBatchLayerEx + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32 + * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32 + * @param[in] output Tensor output info. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, + const ITensorInfo *paddings, const ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToBatchLayerEx (Static block shape and paddings) + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] block_shape_x Block shape x value. + * @param[in] block_shape_y Block shape y value. + * @param[in] padding_left The left padding of the output tensor. + * @param[in] padding_right The right padding of the output tensor. + * @param[in] output Tensor output info. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, + const Size2D &padding_left, const Size2D &padding_right, + const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */ + NEMemsetKernel _memset_kernel; /**< Memset kernel to run */ + bool _has_padding; /**< Flag to check if the output has padding */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h new file mode 100644 index 000000000..117717b55 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ +#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** This function calls the following NEON kernels/functions: + * + * -# @ref NESpaceToDepthLayerKernelEx + */ +class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] block_shape Block shape value + */ + void configure(const ITensor *input, ITensor *output, int32_t block_shape); + /** Static function to check if given info will lead to a valid configuration of @ref + * NESpaceToDepthLayerEx (Static block shape and paddings) + * + * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Tensor output info. Data types supported: same as @p input + * @param[in] block_shape Block shape value + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h new file mode 100644 index 000000000..a50b9ea60 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
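
A hedged sketch of the NESpaceToDepthLayerEx function above; the NHWC-style shapes are assumptions. With block_shape = 2, each 2x2 spatial block moves into the channel dimension.

// Hedged NESpaceToDepthLayerEx sketch; shapes are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

void space_to_depth_example()
{
  using namespace arm_compute;
  Tensor input, output;
  input.allocator()->init(TensorInfo(TensorShape(1U, 4U, 4U, 1U), 1, DataType::F32));
  output.allocator()->init(TensorInfo(TensorShape(4U, 2U, 2U, 1U), 1, DataType::F32));

  if (bool(NESpaceToDepthLayerEx::validate(input.info(), output.info(), 2)))
  {
    NESpaceToDepthLayerEx s2d;
    s2d.configure(&input, &output, 2 /* block_shape */);
    // ... allocate tensors, then s2d.run() ...
  }
}
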
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Function to run the deconvolution layer.
+ *
+ * The transpose convolution layer is the backward pass of the convolution layer: first we
+ * transform the input depending on the stride and pad info, and then perform a 1x1
+ * convolution pass. The input stride defines how many zeroes we should put between each element
+ * of the input, pad is the amount of padding, and finally a is a user-specified
+ * value with a < stride - 1 that increases the padding at the top and right of the input image.
+ *
+ * The relation between input and output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where
+ * width is the size of the first input dimension.
+ * height is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimension.
+ *
+ * The weights used by the transpose convolution are supposed to be the same as the ones used
+ * for the convolution. Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using the @ref
+ * CPPFlipWeightsKernel.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsampleEx
+ * -# @ref NEConvolutionLayer
+ *
+ */
+class NETransposeConvLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer(const NETransposeConvLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ /** Default destructor */
+ virtual ~NETransposeConvLayer() = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension.
Data type
+ * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+                const PadStrideInfo &info, unsigned int invalid_right,
+                unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: S32 for
+ * QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+                        const ITensorInfo *bias, const ITensorInfo *output,
+                        const PadStrideInfo &info, unsigned int invalid_right,
+                        unsigned int invalid_bottom);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEConvolutionLayer _conv_f;
+ CPPUpsampleEx _upsample_f;
+ CPPFlipWeightsKernel _flip_weights;
+ NEPermute _permute_input;
+ NEPermute _permute_weights;
+ NEPermute _permute_output;
+ Tensor _scaled_output;
+ Tensor _weights_flipped;
+ Tensor _permuted_input;
+ Tensor _permuted_weights;
+ Tensor _permuted_output;
+ bool _is_nchw;
+ const ITensor *_original_weights;
+ ITensor *_input;
+ PadStrideInfo _info;
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h
new file mode 100644
index 000000000..3db0c7e5e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
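
Plugging the documented shape relation into numbers: with a 4x4 input, 3x3 kernel, stride 2 and no padding, width_output = (4 - 1) * 2 - 2 * 0 + 3 = 9. The hedged sketch below uses exactly those assumed sizes for NETransposeConvLayer.

// Hedged NETransposeConvLayer sketch; all sizes are illustrative assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
#include "arm_compute/runtime/Tensor.h"

void transpose_conv_example()
{
  using namespace arm_compute;
  Tensor input, weights, bias, output;
  input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
  weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 1U, 1U), 1, DataType::F32));
  bias.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
  // (4 - 1) * 2 - 2 * 0 + 3 = 9 in both spatial dimensions.
  output.allocator()->init(TensorInfo(TensorShape(9U, 9U, 1U), 1, DataType::F32));

  NETransposeConvLayer deconv;
  deconv.configure(&input, &weights, &bias, &output, PadStrideInfo(2, 2, 0, 0),
                   0 /* invalid_right */, 0 /* invalid_bottom */);
  // ... allocate tensors, then deconv.run(); prepare() flips the weights via
  // CPPFlipWeightsKernel before the first run ...
}
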
+ */ + +/** + * @file       GenericGather.h + * @brief      This file contains GenericGather class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLPermute.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Gather with both CPU and GPU + */ +class GenericGather : public arm_compute::IFunction +{ +public: + GenericGather(void) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] indices The indices tensor + * @param[in] output The destination tensor + * @param[in] axis (Optional) The axis in input to gather indices from + * @return N/A + */ + void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis = 0); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + +private: + arm_compute::ITensor *_input{nullptr}; + arm_compute::ITensor *_indices{nullptr}; + arm_compute::ITensor *_output{nullptr}; + int _axis{0}; + arm_compute::CLTensor _cl_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLGatherEx _cl_gather; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h new file mode 100644 index 000000000..ab2fdc71d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/GenericReshapeLayer.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
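
A hedged sketch of the GenericGather function above. In GPU mode the ITensor arguments are expected to actually be CLTensor instances (cf. the CAST_CL macro in Utils.h below); that expectation is an assumption drawn from the class' CL-only members.

// Hedged GenericGather sketch; caller-supplied tensors, axis 0 is an assumption.
#include "arm_compute/runtime/misc/functions/GenericGather.h"

void gather_example(arm_compute::ITensor *input, arm_compute::ITensor *indices,
                    arm_compute::ITensor *output)
{
  arm_compute::misc::GenericGather gather;
  gather.configure(input, indices, output, 0 /* axis */);
  gather.run(); // dispatches to CLGatherEx, going through CLPermute when needed
}
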
+ */ + +/** + * @file       GenericReshapeLayer.h + * @brief      This file contains GenericReshapeLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ + +#include <arm_compute/runtime/Tensor.h> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/runtime/CL/functions/CLPermute.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/NEON/functions/NEPermute.h> +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Reshape Layer with both CPU and GPU + */ +class GenericReshapeLayer : public arm_compute::IFunction +{ +public: + GenericReshapeLayer(void) + : _input(nullptr), _output(nullptr), _cl_permuted{}, _neon_permuted{}, _cl_permute{}, + _cl_reshape{}, _neon_permute{}, _neon_reshape{} + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] output The destination tensor + * @return N/A + */ + void configure(const arm_compute::ITensor *input, arm_compute::ITensor *output); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + +private: + const arm_compute::ITensor *_input; + arm_compute::ITensor *_output; + arm_compute::CLTensor _cl_permuted; + arm_compute::Tensor _neon_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLReshapeLayer _cl_reshape; + + arm_compute::NEPermute _neon_permute; + arm_compute::NEReshapeLayer _neon_reshape; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_RESHAPE_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h new file mode 100644 index 000000000..53736f55f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/misc/functions/Utils.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file utils.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains utils for arm compute library + */ +#ifndef __ARM_COMPUTE_MISC_UTILS_H__ +#define __ARM_COMPUTE_MISC_UTILS_H__ + +#include <string> +#include <cassert> +#include <arm_compute/runtime/CL/CLTensor.h> + +#include <arm_compute/core/Coordinates.h> +#include <arm_compute/core/TensorInfo.h> +#include <arm_compute/core/TensorShape.h> +#include <arm_compute/core/Types.h> + +// TODO : It should be extracted to independent module. 
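
A hedged sketch of the GenericReshapeLayer function above; whether the CL or the NEON path runs is decided internally (presumably via isGpuMode() from the utilities below), not by the caller.

// Hedged GenericReshapeLayer sketch; backend selection happens inside the class.
#include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h"

void reshape_example(const arm_compute::ITensor *input, arm_compute::ITensor *output)
{
  arm_compute::misc::GenericReshapeLayer reshape;
  reshape.configure(input, output); // permute + reshape on the selected backend
  reshape.run();
}
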
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace utils
+{
+
+/**
+ * @brief Check if this runtime runs on GPU or NEON
+ * @return @c true if GPU mode, otherwise @c false
+ */
+bool isGpuMode();
+
+#ifndef CAST_CL
+#define CAST_CL(tensor) static_cast<::arm_compute::CLTensor *>(tensor)
+#endif
+
+#ifndef CAST_NE
+#define CAST_NE(tensor) static_cast<::arm_compute::Tensor *>(tensor)
+#endif
+
+/**
+ * @brief Generate arm compute permutation vector from runtime permutation vector
+ * @param[in] rank Rank number, supported up to 4
+ * @param[in] runtime_pv Integer array for runtime permutation vector
+ * @return Permutation vector of arm compute
+ */
+arm_compute::PermutationVector getARMComputePermutationVector(uint32_t rank,
+                                                              const int32_t *runtime_pv);
+
+/**
+ * @brief Set value to arm compute tensor with casting
+ * @param[in] value Value to set
+ * @param[out] to Target tensor of arm compute
+ * @param[in] id Position of element
+ * @return N/A
+ */
+template <typename FromT>
+void copyCast(const FromT value, arm_compute::ITensor *to, const arm_compute::Coordinates &id)
+{
+  switch (to->info()->data_type())
+  {
+    case arm_compute::DataType::F32:
+    {
+      *reinterpret_cast<float *>(to->ptr_to_element(id)) = static_cast<float>(value);
+      break;
+    }
+    case arm_compute::DataType::S32:
+    {
+      *reinterpret_cast<int32_t *>(to->ptr_to_element(id)) = static_cast<int32_t>(value);
+      break;
+    }
+    case arm_compute::DataType::U32:
+    {
+      *reinterpret_cast<uint32_t *>(to->ptr_to_element(id)) = static_cast<uint32_t>(value);
+      break;
+    }
+    case arm_compute::DataType::QASYMM8:
+    {
+      float realValue = static_cast<float>(value);
+      // NOTE The rounding policy for quantization has not been decided yet,
+      // so RoundingPolicy::TO_ZERO is used as a provisional choice.
+      *(to->ptr_to_element(id)) =
+          to->info()->quantization_info().quantize(realValue, arm_compute::RoundingPolicy::TO_ZERO);
+      break;
+    }
+    default:
+      throw std::runtime_error("Not supported yet");
+      break;
+  }
+}
+
+} // namespace utils
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_UTILS_H__
diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py
new file mode 100644
index 000000000..b3e252892
--- /dev/null
+++ b/compute/ARMComputeEx/resolve_includes.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
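+# Overview (explanatory note): this script expands the #include "..." directives
+# found in the OpenCL kernel sources under src/core/CL/cl_kernels/, wraps each
+# fully expanded file in a C++ raw string literal (R"( ... )"), and writes the
+# result next to the original with an "embed" suffix (e.g. cast.cl ->
+# cast.clembed). The generated *.clembed files are the ones pulled in via
+# #include by CLKernelLibrary.cpp when EMBEDDED_KERNELS is defined.
+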
+import collections +import os.path +import re +import subprocess +import glob + + +def resolve_includes(target, source): + # File collection + FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents') + + # Include pattern + pattern = re.compile("#include \"(.*)\"") + + # Get file contents + files = [] + for i in range(len(source)): + src = source[i] + dst = target[i] + f = open(src) + cts = f.read() + f.close() + contents = cts.splitlines() + entry = FileEntry(target_name=dst, file_contents=contents) + files.append((os.path.basename(src), entry)) + + # Create dictionary of tupled list + files_dict = dict(files) + + # Check for includes (can only be files in the same folder) + final_files = [] + for file in files: + done = False + tmp_file = file[1].file_contents + print(file[1].target_name) + while not done: + file_count = 0 + updated_file = [] + for line in tmp_file: + found = pattern.search(line) + if found: + include_file = found.group(1) + data = files_dict[include_file].file_contents + updated_file.extend(data) + else: + updated_file.append(line) + file_count += 1 + + # Check if all include are replaced. + if file_count == len(tmp_file): + done = True + + # Update temp file + tmp_file = updated_file + + # Append and prepend string literal identifiers and add expanded file to final list + tmp_file.insert(0, "R\"(\n") + tmp_file.append("\n)\"") + entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file) + final_files.append((file[0], entry)) + + # Write output files + for file in final_files: + with open(file[1].target_name, 'w+') as out_file: + out_file.write("\n".join(file[1].file_contents)) + + +# Generate embed files +cl_files = glob.glob('src/core/CL/cl_kernels/*.cl') +cl_files += glob.glob('src/core/CL/cl_kernels/*.h') + +# DEBUG: print cl files +print("cl_files:") +print(cl_files) + +embed_files = [f + "embed" for f in cl_files] +print("embed_files:") +print(embed_files) + +resolve_includes(embed_files, cl_files) diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp new file mode 100644 index 000000000..7d4760600 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
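
Before the implementation below, a hedged sketch of how a kernel is typically obtained from this library; the "cast" kernel name comes from the map that follows, while the build-option values are illustrative assumptions, and std::set<std::string> is taken to match the library's StringSet alias.

// Hedged usage sketch for CLKernelLibraryEx::create_kernel; options are assumptions.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"

#include <set>
#include <string>

arm_compute::Kernel build_cast_kernel()
{
  const std::set<std::string> build_opts{"-DDATA_TYPE_IN=uchar", "-DDATA_TYPE_OUT=float"};
  // Looks up the program owning "cast", builds it (or reuses a cached build
  // with identical options), and returns the kernel object.
  return arm_compute::CLKernelLibraryEx::get().create_kernel("cast", build_opts);
}
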
+ */ +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <utility> +#include <vector> + +using namespace arm_compute; + +const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { + // ARMComputeEx kernels + {"arg_op", "arg_operation.cl"}, + {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast", "cast.cl"}, + {"cast_qasymm_in", "cast.cl"}, + {"cast_qasymm_out", "cast.cl"}, + {"comparison_op", "comparison_op.cl"}, + {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, + {"depth_to_space_nchw", "depth_to_space.cl"}, + {"depth_to_space_nhwc", "depth_to_space.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"permute_generic", "permute_ex.cl"}, + {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, + {"prelu", "prelu.cl"}, + {"prelu_qasymm8", "prelu_quantized.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"space_to_batch_4d_nchw", "space_to_batch.cl"}, + {"space_to_batch_4d_nhwc", "space_to_batch.cl"}, + {"space_to_depth_nchw", "space_to_depth.cl"}, + {"space_to_depth_nhwc", "space_to_depth.cl"}, +}; + +const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { +#ifdef EMBEDDED_KERNELS + { + "arg_operation.cl", +#include "./cl_kernels/arg_operation.clembed" + }, + { + "cast.cl", +#include "./cl_kernels/cast.clembed" + }, + { + "embedding_lookup.cl", +#include "./cl_kernels/embedding_lookup.clembed" + }, + { + "depth_to_space.cl", +#include "./cl_kernels/depth_to_space.clembed" + }, + { + "gather_ex.cl", +#include "./cl_kernels/gather_ex.clembed" + }, + { + "hashtable_lookup.cl", +#include "./cl_kernels/hashtable_lookup.clembed" + }, + { + "helpers.h", +#include "./cl_kernels/helpers.hembed" + }, + { + "helpers_asymm.h", +#include "./cl_kernels/helpers_asymm.hembed" + }, + { + "instance_normalization_ex.cl", +#include "./cl_kernels/instance_normalization_ex.clembed" + }, + { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "neg_tensor.cl", +#include "./cl_kernels/neg_tensor.clembed" + }, + { + "prelu.cl", +#include "./cl_kernels/prelu.clembed" + }, + { + "prelu_quantized.cl", +#include "./cl_kernels/prelu_quantized.clembed" + }, + { + "reduce_operation.cl", +#include "./cl_kernels/reduce_operation.clembed" + }, + { + "space_to_batch.cl", +#include "./cl_kernels/space_to_batch.clembed" + }, + { + "space_to_depth.cl", +#include "./cl_kernels/space_to_depth.clembed" + }, + { + "topkv2.cl", +#include "./cl_kernels/topkv2.clembed" + }, + { + "topkv2_radixsort.cl", +#include 
"./cl_kernels/topkv2_radixsort.clembed" + }, + { + "topkv2_quicksort.cl", +#include "./cl_kernels/topkv2_quicksort.clembed" + }, + +#endif /* EMBEDDED_KERNELS */ +}; + +CLKernelLibraryEx::CLKernelLibraryEx() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() +{ + opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the + // CLKernelLibraryEx is built +} + +CLKernelLibraryEx &CLKernelLibraryEx::get() +{ + static CLKernelLibraryEx _kernel_library; + return _kernel_library; +} + +Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, + const StringSet &build_options_set) const +{ + // Find which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if (_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + std::string concat_str; + + if (fp16_supported()) + { + concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; + } + + if (get_cl_version(_device) == CLVersion::CL20) + { + concat_str += " -cl-std=CL2.0 "; + } + else if (arm_non_uniform_workgroup_supported(_device)) + { + concat_str += " -cl-arm-non-uniform-work-group-size "; + } + else + { + ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); + } + + // Check if the program has been built before with same build options. + const std::string program_name = kernel_program_it->second; + const std::string build_options = stringify_set(build_options_set) + concat_str; + + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + cl::Program cl_program; + + if (_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + cl_program = built_program_it->second; + } + else + { + // Get program + Program program = load_program(program_name); + + // Build program + cl_program = program.build(build_options); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, cl_program); + } + + // Create and return kernel + return Kernel(kernel_name, cl_program); +} + +void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, + cl::Program program) +{ + _built_programs_map.emplace(built_program_name, program); +} + +bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } + +bool CLKernelLibraryEx::int64_base_atomics_supported() const +{ + return device_supports_extension(_device, "cl_khr_int64_base_atomics"); +} + +const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if (program_it != _programs_map.end()) + { + return program_it->second; + } + + Program program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if (_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + program = Program(_context, program_name, program_source_it->second); +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _kernel_path + program_name; + std::string binary_name = source_name + "bin"; + + if (std::ifstream(binary_name).is_open()) + { + const std::string program_binary = read_file(binary_name, true); + program = Program(_context, _device, program_name, + 
std::vector<unsigned char>(program_binary.begin(), program_binary.end())); + } + else if (std::ifstream(source_name).is_open()) + { + program = Program(_context, program_name, read_file(source_name, false)); + } + else + { + ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str()); + } +#endif /* EMBEDDED_KERNELS */ + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const +{ + std::string concat_set; + +#ifndef EMBEDDED_KERNELS + concat_set += "-I" + _kernel_path + " "; +#endif /* EMBEDDED_KERNELS */ + + // Concatenate set + for (const auto &el : s) + { + concat_set += " " + el; + } + + return concat_set; +} + +std::string CLKernelLibraryEx::get_program_source(const std::string &program_name) +{ + const auto program_source_it = _program_source_map.find(program_name); + + if (program_source_it == _program_source_map.end()) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + return program_source_it->second; +} + +size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const +{ + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG( + err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); + + return result; +} + +cl::NDRange CLKernelLibraryEx::default_ndrange() const +{ + // GPUTarget _target = get_target_from_device(_device); + cl::Device device = cl::Device::getDefault(); + GPUTarget _target = get_target_from_device(device); + cl::NDRange default_range; + + switch (_target) + { + case GPUTarget::MIDGARD: + case GPUTarget::T600: + case GPUTarget::T700: + case GPUTarget::T800: + default_range = cl::NDRange(128u, 1); + break; + default: + default_range = cl::NullRange; + } + + return default_range; +} + +std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl new file mode 100644 index 000000000..2a6dfc91f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform arg_max/arg_min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. + * e.g. -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. 
-DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: + * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element + * in the source image + * @param[in] input_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. + * Supported data types: U32 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. 
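+ * @note Illustrative host-side sketch (not part of the original file; the option values here are
+ *       assumed): these compile-time definitions are what CLKernelLibraryEx::create_kernel()
+ *       receives in its build-options set, e.g.
+ * @code
+ * std::set<std::string> build_opts = {"-DDATA_TYPE=float", "-DDEPTH_OUT=16", "-DOP_CODE=1"};
+ * auto arg_max_kernel = arm_compute::CLKernelLibraryEx::get().create_kernel("arg_op", build_opts);
+ * @endcode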
+ */ + +__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis, + const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + DATA_TYPE tval = value; + int idx = 0; + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; + +#if OP_CODE == 1 // ArgMax + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#elif OP_CODE == 2 // ArgMin + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); +#else + return; + +#endif + + if (tval != value) + { + idx = indices[axis]; + tval = value; + } + } + + *((__global uint *)out.ptr) = idx; +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl new file mode 100644 index 000000000..77e239f55 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers_asymm.h" + +#ifdef SATURATE +#define ADD(x, y) add_sat((x), (y)) +#define SUB(x, y) sub_sat((x), (y)) +#else /* SATURATE */ +#define ADD(x, y) (x) + (y) +#define SUB(x, y) (x) - (y) +#endif /* SATURATE */ + +/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8 + * + * The following computations will be performed: + * + * -# Add offset terms to inputs + -# Get scaled value of two inputs + * -# Add inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. 
-DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The number of bits to shift left of input tensors must be passed at compile time using
+ * -DLEFT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
+ * must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT,
+ * -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @attention The input and output scale information of qasymm8 need to be passed at compile time
+ * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
+ * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
+ * @attention The input and output scale offsets need to be passed at compile time using
+ * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
+ * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention To perform a saturating operation, -DSATURATE has to be passed to the compiler;
+ * otherwise a wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source tensor.
+ * Supported data types: QASYMM8
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types:
+ * QASYMM8
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] out_ptr Pointer to the destination tensor.
+ * Supported data types: QASYMM8 + * @param[in] out_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Get scaled value of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + + VEC_DATA_TYPE(int, 16) + left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); + VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; + VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; + + VEC_DATA_TYPE(int, 16) + scaled_in1_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); + VEC_DATA_TYPE(int, 16) + scaled_in2_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); + + // Add inputs and multiply with a multiplier smaller than 1 + VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; + VEC_DATA_TYPE(int, 16) + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + + // TODO: Apply min-max BOUND to support fuse with relu. + /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl new file mode 100644 index 000000000..8c875516d --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OP_CODE) && defined(DATA_TYPE) +/** returns truth value of the two input tensors for BINARY LOGICAL OP. + * where BINARY LOGICAL OP can be AND, OR. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. + * e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input2_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+                                TENSOR3D_DECLARATION(output))
+{
+  Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+  Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if OP_CODE == 1 // LOGICAL AND
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#elif OP_CODE == 2 // LOGICAL OR
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+               VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#else // OP NOT SUPPORTED
+  return;
+
+#endif
+}
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..2342fda9f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef SCALE
+#define SCALE 1.0f
+#endif
+#ifndef OFFSET
+#define OFFSET 0
+#endif
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
+/** Perform a cast operation on an input tensor.
+ *
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention -DBOOL_INPUT : Whether the input type is bool.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+  VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+  res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
+                VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+#if defined(BOOL_INPUT)
+  VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE));
+  VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1);
+  res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+#endif // defined(BOOL_INPUT)
+
+  VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr);
+}
+
+/** Perform a cast operation on a QASYMM8 input tensor.
+ * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT compile flags, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
+ * @attention Offset and Scale of input should be given as a preprocessor argument using
+ * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source image.
Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); + + VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; + + VSTORE(VEC_SIZE) + (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, + (__global DATA_TYPE_OUT *)output.ptr); +} + +/** Perform a cast operation on an QASYMM8 output tensor. + * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int + * @attention Offset and Scale of output should be given as a preprocessor argument using + * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: U8 + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE); + + VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); + + VSTORE(VEC_SIZE) + (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, + (__global DATA_TYPE_OUT *)output.ptr); +} +#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl new file mode 100644 index 000000000..e005322f7 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform depth to space rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
+ * using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention Block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ * -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+
+  int out_index[4] = {0};
+  int in_index[4] = {0};
+
+  out_index[0] = get_global_id(0);         // W
+  out_index[1] = get_global_id(1);         // H
+  out_index[2] = get_global_id(2) % Z_OUT; // C
+  out_index[3] = get_global_id(2) / Z_OUT; // B
+
+  in_index[0] = out_index[0] / BLOCK_SIZE;
+  in_index[1] = out_index[1] / BLOCK_SIZE;
+  in_index[2] = out_index[2] +
+                ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
+  in_index[3] = out_index[3];
+
+  *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+      &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform depth to space
rearrangement of tensor (NHWC) + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention The value of the z-axis of output tensor should be given as a preprocessor argument + * using -DZ_OUT=size. e.g. -DZ_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + out_index[0] = get_global_id(0); // C + out_index[1] = get_global_id(1); // W + out_index[2] = get_global_id(2) % Z_OUT; // H + out_index[3] = get_global_id(2) / Z_OUT; // B + + in_index[0] = out_index[0] + + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; + in_index[1] = out_index[1] / BLOCK_SIZE; + in_index[2] = out_index[2] / BLOCK_SIZE; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset( + &in, in_index[0], in_index[1], in_index[2], in_index[3])); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl new file mode 100644 index 000000000..dd8cb6d93 --- /dev/null +++ 
b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform embedding_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. 
Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector + */ + +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + // lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl new file mode 100644 index 000000000..09f776156 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) + +/** Performs the Gather operation along the chosen axis + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 + * @attention Input tensor depth should be given as a preprocessor argument using + * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data + * types: U8/S8/U16/S16/U32/S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] input_stride_z Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] input_stride_w Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_w input_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source + * tensor + * @param[in] indices_ptr Pointer to the source tensor. Supported data + * types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination + * tensor + */ +__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices), + TENSOR4D_DECLARATION(output)) +{ + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pz = get_global_id(2) % OUTPUT_DIM_Z; + const int pw = get_global_id(2) / OUTPUT_DIM_Z; + + const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z); + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); + +#if AXIS == 0 +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0); + __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); + __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0); +#endif +#elif AXIS == 1 +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0); +#endif +#elif AXIS == 2 +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0); +#endif +#elif AXIS == 3 +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index); +#endif +#endif // AXIS + + *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr); +} + +#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl 
b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl new file mode 100644 index 000000000..73f29e3e5 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform hashtable_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector + */ +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; + + if (lup_id[NUM_DIMS - 1] < 0) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); + return; + } + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h new file mode 100644 index 000000000..0e123ae0a --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_HELPER_H +#define ARM_COMPUTE_HELPER_H + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) +#pragma OPENCL EXTENSION cl_arm_printf : enable +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) + +#define EXPAND(x) x + +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + +#define VLOAD_STR(size) vload##size +#define VLOAD(size) VLOAD_STR(size) + +#define VSTORE_STR(size) vstore##size +#define VSTORE(size) VSTORE_STR(size) + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CL_VEC_DATA_TYPE_STR(type, size) type##size +#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) + +#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ + uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint 
name##_step_z, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_VECTOR_STRUCT(name) \ + update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x) + +#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ + update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, name##_step_x, name##_stride_y, \ + name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, name##_step_x, name##_stride_y, \ + name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0) + +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z, name##_stride_w, name##_step_w, mod_size) + +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \ + mod_size) + +/** Structure to hold Vector information */ +typedef struct Vector +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ +} Vector; + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Structure to hold 3D tensor information */ +typedef struct Tensor3D +{ + __global uchar *ptr; /**< Pointer to the 
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Structure to hold 4D tensor information */
+typedef struct Tensor4D
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the image in Z dimension (in bytes) */
+ int stride_w; /**< Stride of the image in W dimension (in bytes) */
+} Tensor4D;
+
+/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ *
+ * @return A vector object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x)
+{
+ Vector vector = {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ };
+ vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+ return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's
+ * data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr +=
+ img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return An image object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes,
+ uint stride_x, uint step_x, uint stride_y,
+ uint step_y, uint stride_z, uint step_z)
+{
+ Image img = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y};
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per
+ * workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per
+ * workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per
+ * workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z)
+{
+ Tensor3D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z};
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return tensor;
+}
+
+/** Wrap 4D tensor information into a tensor structure, and make the pointer point at this
+ * workitem's data. The third workitem dimension carries both Z and W, split by @p mod_size
+ * (see CONVERT_TO_TENSOR4D_STRUCT).
+ */
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+ uint offset_first_element_in_bytes, uint stride_x,
+ uint step_x, uint stride_y, uint step_y, uint stride_z,
+ uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+ Tensor4D tensor = {.ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z,
+ .stride_w = stride_w};
+
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
+ get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+ (get_global_id(2) / mod_size) * step_w;
+ return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+ return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+ return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ * @param[in] w Relative W position
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+ w * tensor->stride_w;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..c39138caa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
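+ * @note Editorial example, derived from the implementation below:
+ *       asymm_rounding_divide_by_POW2_2((int2)(7, -7), 2) yields (int2)(2, -2), since
+ *       7/4 = 1.75 and -7/4 = -1.75 both round to the nearest integer.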
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ mask = (1 << exponent) - 1; \
+ const VEC_DATA_TYPE(int, size) zero = 0; \
+ const VEC_DATA_TYPE(int, size) one = 1; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+ }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ overflow = a == b && a == INT_MIN; \
+ VEC_DATA_TYPE(long, size) \
+ a_64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b_64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ ab_64 = a_64 * b_64; \
+ /* COMPMID-907 */ \
+ VEC_DATA_TYPE(int, size) \
+ ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+ return select(ab_x2_high32, INT_MAX, overflow); \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+ a) \
+ { \
+ const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+ const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+ const int k_fractional_bits = 31; \
+ VEC_DATA_TYPE(int, size) \
+ x = a + (1 << (k_fractional_bits - 3)); \
+ VEC_DATA_TYPE(int, size) \
+ x2 = ASYMM_MULT(x, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x3 = ASYMM_MULT(x2, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4 = ASYMM_MULT(x2, x2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2 = \
+ ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+ VEC_DATA_TYPE(int, size) \
+ x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+ ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+ return constant_term + \
+ ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+ }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result containing bits from @p then_val or from @p else_val depending on whether the
+ * corresponding bit in @p if_mask is set.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+ VEC_DATA_TYPE(int, size) then_val, \
+ VEC_DATA_TYPE(int, size) else_val) \
+ { \
+ return (if_mask & then_val) ^ (~if_mask & else_val); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with all bits set in each lane where the corresponding element of @p a
+ * is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a == 0); \
+ }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector with all bits set in each lane where the corresponding element of @p a
+ * is non-zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+ const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+ return select(all_zeros, all_ones, a != 0); \
+ }
+
+/** Conditionally multiplies @p result by the fixed-point constant @p fp_multiplier: lanes where
+ * bit (k_fractional_bits + exponent) of @p remainder is set are multiplied, other lanes pass
+ * through unchanged. One step of the barrel-shifter evaluation of exp() used below.
+ *
+ * @param[in] size Size of vector.
+ */
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+ VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+ int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+ { \
+ if (k_integer_bits > exponent) \
+ { \
+ const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \
+ return ASYMM_SELECT_USING_MASK( \
+ ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+ ASYMM_MULT(result, fp_multiplier, size), result, size); \
+ } \
+ \
+ return result; \
+ }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+ { \
+ const int k_fractional_bits = 31 - k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ k_one_quarter = 1 << (k_fractional_bits - 2); \
+ VEC_DATA_TYPE(int, size) \
+ mask = k_one_quarter - 1; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+ VEC_DATA_TYPE(int, size) \
+ a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+ VEC_DATA_TYPE(int, size) \
+ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+ a_mod_quarter_minus_one_quarter_scaled, size); \
+ VEC_DATA_TYPE(int, size) \
+ remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+ result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+ remainder, size); \
+ result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+ size); \
+ result = \
+ EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+ if (k_integer_bits > 5) \
+ { \
+ const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+ result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+ }
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+ { \
+ if (exponent < 0) \
+ { \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+ } \
+ \
+ const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+ const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+ int threshold = ((1 << (31 - exponent)) - 1); \
+ VEC_DATA_TYPE(int, size) \
+ positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+ VEC_DATA_TYPE(int, size) \
+ result = x << exponent; \
+ result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+ result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+ return result; \
+ }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+ { \
+ VEC_DATA_TYPE(long, size) \
+ a64 = convert_long##size(a); \
+ VEC_DATA_TYPE(long, size) \
+ b64 = convert_long##size(b); \
+ VEC_DATA_TYPE(long, size) \
+ sum = a64 + b64; \
+ const VEC_DATA_TYPE(long, size) one = 1; \
+ const VEC_DATA_TYPE(long, size) minus_one = -1; \
+ VEC_DATA_TYPE(long, size) \
+ sign = select(minus_one, one, sum >= 0); \
+ return convert_int##size((sum + sign) / 2); \
+ }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+ { \
+ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+ const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+ const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+ VEC_DATA_TYPE(int, size) \
+ x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+ for (int i = 0; i < 3; i++) \
+ { \
+ VEC_DATA_TYPE(int, size) \
+ half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+ VEC_DATA_TYPE(int, size) \
+ one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+ VEC_DATA_TYPE(int, size) \
+ tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+ x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+ } \
+ return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+ }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update the
+ * value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
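+ * @note Editorial example: asymm_rescale4(v, 4, 5) uses exponent = 4 - 5 = -1, so the raw
+ *       representation is rounding-divided by 2; a Q5 value carries one more integer bit
+ *       (one less fractional bit) than a Q4 value.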
+ */ +#define ASYMM_RESCALE_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \ + int src_integer_bits, int dst_integer_bits) \ + { \ + int exponent = src_integer_bits - dst_integer_bits; \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ + } + +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ + asymm_rounding_divide_by_POW2_##size(x, exponent) +#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) +#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ + asymm_select_using_mask##size(if_mask, then_val, else_val) +#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) +#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) +#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ + remainder, size) \ + exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ + remainder) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \ + asymm_exp_on_negative_values##size(a, k_integer_bits) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ + asymm_saturating_rounding_mult_by_pow2##size(x, exponent) +#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) + +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) + +ASYMM_MULT_IMPL(2) +ASYMM_MULT_IMPL(4) +ASYMM_MULT_IMPL(8) +ASYMM_MULT_IMPL(16) + +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) + +ASYMM_SELECT_USING_MASK_IMPL(2) +ASYMM_SELECT_USING_MASK_IMPL(4) +ASYMM_SELECT_USING_MASK_IMPL(8) +ASYMM_SELECT_USING_MASK_IMPL(16) + +ASYMM_MASK_IF_ZERO_IMPL(2) +ASYMM_MASK_IF_ZERO_IMPL(4) +ASYMM_MASK_IF_ZERO_IMPL(8) +ASYMM_MASK_IF_ZERO_IMPL(16) + +ASYMM_MASK_IF_NON_ZERO_IMPL(2) +ASYMM_MASK_IF_NON_ZERO_IMPL(4) +ASYMM_MASK_IF_NON_ZERO_IMPL(8) +ASYMM_MASK_IF_NON_ZERO_IMPL(16) + +EXP_BARREL_SHIFTER_IMPL(2) +EXP_BARREL_SHIFTER_IMPL(4) +EXP_BARREL_SHIFTER_IMPL(8) +EXP_BARREL_SHIFTER_IMPL(16) + +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) + +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) + +ASYMM_ROUNDING_HALF_SUM_IMPL(2) +ASYMM_ROUNDING_HALF_SUM_IMPL(4) +ASYMM_ROUNDING_HALF_SUM_IMPL(8) +ASYMM_ROUNDING_HALF_SUM_IMPL(16) + +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) 
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) + +ASYMM_RESCALE_IMPL(2) +ASYMM_RESCALE_IMPL(4) +ASYMM_RESCALE_IMPL(8) +ASYMM_RESCALE_IMPL(16) + +#endif // ARM_COMPUTE_HELPERS_ASYMM_H
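[Editor's note] The ASYMM_* dispatch macros above are the building blocks for GEMMLowp-style
requantization: a 32-bit accumulator is multiplied by a fixed-point multiplier in [-1, 1)
(ASYMM_MULT) and then rounding-shifted right (ASYMM_ROUNDING_DIVIDE_BY_POW2). A minimal OpenCL
sketch follows; the kernel name, buffer arguments, and the example -DRESULT_MULT_INT /
-DRESULT_SHIFT values are illustrative assumptions, not part of this patch.
pixelwise_mul_quantized.cl later in this diff applies the same pattern through
ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE.

#include "helpers_asymm.h"

// Hypothetical kernel: requantize int32 accumulators, four lanes per workitem.
// Example build flags: -DRESULT_MULT_INT=1431655765 -DRESULT_SHIFT=1
// (fixed-point 2/3 followed by a rounding right shift by 1, i.e. an overall scale of ~1/3).
__kernel void requantize_sketch(__global const int *acc, __global int *dst)
{
  const int gid = get_global_id(0);
  int4 v = vload4(gid, acc);
  // Fixed-point multiply (round-to-nearest, saturating), then rounding right shift.
  int4 scaled = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(v, RESULT_MULT_INT, RESULT_SHIFT, 4);
  vstore4(scaled, gid, dst);
}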
\ No newline at end of file diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl new file mode 100644 index 000000000..1d96150f8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) +/** This function normalizes the input 2D tensor across the first dimension with respect to mean and + * standard deviation of the same dimension. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. + * -DDATA_TYPE=float + * @attention Normalization epsilon parameter should be given as a preprocessor argument with + * -DEPSILON=value. e.g. -DEPSILON=0.001f + * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, + * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported + * data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr (Optional) Pointer to the destination tensor. 
+ * Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the destination tensor
+ * @param[in] gamma_ptr (Optional) Pointer to the gamma tensor.
+ * Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x (Optional) Stride of the gamma tensor in X
+ * dimension (in bytes)
+ * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the gamma tensor
+ * @param[in] beta_ptr (Optional) Pointer to the beta tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] beta_stride_x (Optional) Stride of the beta tensor in X
+ * dimension (in bytes)
+ * @param[in] beta_step_x (Optional) beta_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the beta tensor
+ */
+__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+#ifdef GAMMA
+ ,
+ VECTOR_DECLARATION(gamma)
+#endif // GAMMA
+#ifdef BETA
+ ,
+ VECTOR_DECLARATION(beta)
+#endif // BETA
+ )
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+#ifndef IN_PLACE
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+#endif /* IN_PLACE */
+
+ float sum = 0.f;
+ float sum_sq = 0.f;
+
+#if defined(NHWC)
+
+ const int ch = get_global_id(0); // Current channel
+ const int batch = get_global_id(2); // Current batch
+ const int elements_plane = DIM_Y * DIM_Z;
+
+ for (int i_w = 0; i_w < DIM_Y; ++i_w)
+ {
+ for (int i_h = 0; i_h < DIM_Z; ++i_h)
+ {
+ float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+ sum += data;
+ sum_sq += data * data;
+ }
+ }
+
+#else // !defined(NHWC)
+ const int ch = get_global_id(2) % DIM_Z; // Current channel
+ const int batch = get_global_id(2) / DIM_Z; // Current batch
+ const int elements_plane = DIM_X * DIM_Y;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ part_sum = 0.f;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ part_sum_sq = 0.f;
+ // Calculate partial sum
+ for (int y = 0; y < DIM_Y; ++y)
+ {
+ int x = 0;
+ for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+ {
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+ part_sum += data;
+ part_sum_sq += data * data;
+ }
+ // Left-overs loop
+ for (; x < DIM_X; ++x)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch));
+ part_sum.s0 += data;
+ part_sum_sq.s0 += data * data;
+ }
+ }
+// Perform reduction
+#if VEC_SIZE > 8
+ part_sum.s01234567 += part_sum.s89abcdef;
+ part_sum_sq.s01234567 += part_sum_sq.s89abcdef; +#endif // VEC_SIZE > 8 +#if VEC_SIZE > 4 + part_sum.s0123 += part_sum.s4567; + part_sum_sq.s0123 += part_sum_sq.s4567; +#endif // VEC_SIZE > 4 +#if VEC_SIZE > 2 + part_sum.s01 += part_sum.s23; + part_sum_sq.s01 += part_sum_sq.s23; +#endif // VEC_SIZE > 2 + part_sum.s0 += part_sum.s1; + part_sum_sq.s0 += part_sum_sq.s1; + + sum = (float)part_sum.s0; + sum_sq = (float)part_sum_sq.s0; + +#endif // defined(NHWC) + + const float mean_float = (sum / elements_plane); + const DATA_TYPE mean = (DATA_TYPE)mean_float; + const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float); +#if defined(GAMMA) + const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON); + const DATA_TYPE multip = (DATA_TYPE)multip_float; +#else // !defined(GAMMA) + const DATA_TYPE multip = (DATA_TYPE)0; +#endif // defined(GAMMA) +#if defined(BETA) + const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch); +#else // !defined(BETA) + const DATA_TYPE beta = 0; +#endif // defined(BETA) + +#if defined(NHWC) + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } + +#else // !defined(NHWC) + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, input_address); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = (data - mean) * multip + beta; + VSTORE(VEC_SIZE) + (res, 0, output_address); + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } +#endif // defined(NHWC) +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) */ diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..4aa7883c3 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Performs a negation of the input tensor.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types:
+ * S16/S32/F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed
+ * per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination
+ * image
+ *
+ */
+__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
new file mode 100644
index 000000000..2074d3ceb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ */
+#include "helpers_asymm.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of
+ * GEMMLowp to QASYMM8
+ *
+ * The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to inputs
+ * -# Multiply inputs
+ * -# Multiply each entry of result by result_mult_int
+ * -# Shift the int32 accumulator by result_shift
+ * -# Add offset terms to final result
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using
+ * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and
+ * -DIN2_OFFSET
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and
+ * -DRESULT_SHIFT
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out), const float scale)
+{
+ // Get pixels pointer
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(int, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
+ VEC_DATA_TYPE(int, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
+
+ // Perform multiplication of two inputs
+ VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
+ VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
+ VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val;
+
+ // Multiply with a multiplier smaller than 1
+ out_val =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
+ out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
+
+ VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
+
+ // TODO: Apply min-max BOUND to support fusing with ReLU.
+ /*
+ #if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+ #endif // defined(MIN_BOUND)
+ #if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+ #endif // defined(MAX_BOUND)
+ */
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
new file mode 100644
index 000000000..62a8901f6
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE)
+/** Returns result of prelu function implemented as below:
+ * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in] input1_ptr Pointer to the source image. Supported Data
+ * types : F16/F32
+ * @param[in] input1_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input1_step_x input1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input1_step_y input1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input1_step_z input1_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] alpha_ptr Pointer to the source image. Supported Data
+ * types : F16/F32
+ * @param[in] alpha_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ *
+ * @param[out] output_ptr Pointer to the destination image. Supported
+ * data types: same as @p input1_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VSTORE(VEC_SIZE)
+ (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
+ ?
VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) * + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr) + : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), + 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl new file mode 100644 index 000000000..5e0abd585 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" +#define SUB(x, y) (x) - (y) + +#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \ + defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +#define SELECT_TYPE VEC_INT + +/** Returns result of prelu function implemented as below: + * f(input) = alpha * input for input < 0, f(input) = input for input >= 0. + * + * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. + * -DDATA_TYPE_IN=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note Can only take uchar data types. + * + * @param[in] input1_ptr Pointer to the source image. Supported Data + * types : QASYMM8 + * @param[in] input1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] alpha_ptr Pointer to the source image. 
Supported Data
+ * types : QASYMM8
+ * @param[in] alpha_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] alpha_step_x alpha_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[out] output_ptr Pointer to the destination image. Supported
+ * data types: same as @p input1_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ */
+__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
+ VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
+
+ in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN));
+ alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA));
+
+ const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN);
+ const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA);
+ const VEC_FLOAT outf32 =
+ select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE));
+ const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
+ const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global uchar *)output.ptr);
+}
+
+#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) &&
+ // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
new file mode 100644
index 000000000..d7ea2e2c4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
+/** Perform reduce max/min
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention Operation type (code) specifying which operation to perform should be passed as a
+ * preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension
+ * (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ * @param[in] axis Axis along which the reduction occurs
+ * @param[in] dim Number of elements along the axis to be reduced.
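+ * @note Editorial example: building this file with -DDATA_TYPE=float -DDEPTH_OUT=16 -DOP_CODE=1
+ *       selects the REDUCE_MAX path in the kernel below; -DOP_CODE=2 selects REDUCE_MIN.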
+ */
+__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+ const int axis, const int dim)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+ int indices[4] = {
+ get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+ get_global_id(2) / DEPTH_OUT,
+ };
+
+ DATA_TYPE value =
+ *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+ for (int i = 1; i < dim; ++i)
+ {
+ indices[axis] = i;
+
+#if OP_CODE == 1 // REDUCE_MAX
+ value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+
+#elif OP_CODE == 2 // REDUCE_MIN
+ value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
+ indices[2], indices[3])));
+
+#else // OP NOT SUPPORTED
+ return;
+
+#endif
+ }
+
+ *((__global DATA_TYPE *)out.ptr) = value;
+}
+
+/** Perform reduce sum/mean
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ * e.g. -DDEPTH_OUT=16
+ * @attention Operation type (code) specifying which operation to perform should be passed as a
+ * preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data
+ * types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * image
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[out] output_ptr Pointer to the destination image. Supported data
+ * types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension
+ * (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination image
+ * @param[in] axis Axis along which the reduction occurs
+ * @param[in] dim Number of elements along the axis to be reduced.
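+ * @note Editorial example: -DOP_CODE=3 selects the REDUCE_SUM path in the kernel below and
+ *       -DOP_CODE=4 selects REDUCE_MEAN (the sum divided by @p dim).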
+ */
+__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+                              const int axis, const int dim)
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+  int indices[4] = {
+      get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
+      get_global_id(2) / DEPTH_OUT,
+  };
+
+  DATA_TYPE sum_value = (DATA_TYPE)0;
+  for (int i = 0; i < dim; ++i)
+  {
+    indices[axis] = i;
+    sum_value += *(
+        (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
+  }
+
+#if OP_CODE == 3 // REDUCE_SUM
+  *((__global DATA_TYPE *)out.ptr) = sum_value;
+
+#elif OP_CODE == 4 // REDUCE_MEAN
+  *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE);
+
+#else // OP NOT SUPPORTED
+  return;
+
+#endif
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 000000000..7367da7fb
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \
+    defined(WIDTH_IN) && defined(ZERO_VALUE)
+/** Perform space to batch with input of 4D and NCHW format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
+ *            e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size.
+ *            e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ *            e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size.
+ *            e.g. -DWIDTH_IN=16
+ * @attention The padding value should be given as a preprocessor argument using -DZERO_VALUE=value.
+ *            e.g. -DZERO_VALUE=0
+ *
+ * @param[in]  input_ptr                               Pointer to the source tensor.
Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. + * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. 
Supported
+ *                                                     data types: S32
+ * @param[in]  padding_size_stride_x                   Stride of the source tensor in X
+ *                                                     dimension (in bytes)
+ * @param[in]  padding_size_step_x                     padding_size_stride_x * number of
+ *                                                     elements along X processed per workitem
+ *                                                     (in bytes)
+ * @param[in]  padding_size_stride_y                   Stride of the source tensor in Y
+ *                                                     dimension (in bytes)
+ * @param[in]  padding_size_step_y                     padding_size_stride_y * number of
+ *                                                     elements along Y processed per workitem
+ *                                                     (in bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes The offset of the first element in the
+ *                                                     padding size tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+  int block_size_x = *((__global int *)(block_size_ptr));
+  int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x));
+  int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+  int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+  int in_index[4] = {
+      0,
+  };
+  in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr));
+  in_index[1] = get_global_id(1) * block_size_y + shift_y -
+                *((__global int *)(padding_size_ptr + padding_size_stride_y));
+  in_index[2] = get_global_id(2) % DEPTH_OUT;
+  in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+  if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+  {
+    *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+  }
+  else
+  {
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
+        &in, in_index[0], in_index[1], in_index[2], in_index[3]));
+  }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) &&
+       // defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && \
+    defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using
+ *            -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size.
+ *            e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size.
+ *            e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size.
+ *            e.g. -DWIDTH_IN=16
+ * @attention The padding value should be given as a preprocessor argument using -DZERO_VALUE=value.
+ *            e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                               Pointer to the source tensor.
Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along + * W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. + * Supported data types: same as @p + * input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements + * along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] block_size_ptr Pointer to the source tensor. Supported + * data types: S32 + * @param[in] block_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] block_size_step_x block_size_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] block_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] padding_size_ptr Pointer to the source tensor. 
Supported + * data types: S32 + * @param[in] padding_size_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] padding_size_step_x padding_size_stride_x * number of + * elements along X processed per workitem + * (in bytes) + * @param[in] padding_size_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] padding_size_step_y padding_size_stride_y * number of + * elements along Y processed per workitem + * (in bytes) + * @param[in] padding_size_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(block_size), + IMAGE_DECLARATION(padding_size)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT); + + int block_size_x = *((__global int *)(block_size_ptr)); + int block_size_y = *((__global int *)(block_size_ptr + block_size_stride_x)); + int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x; + int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x; + + int in_index[4] = { + 0, + }; + in_index[0] = get_global_id(0) * VEC_SIZE; + in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr)); + in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - + *((__global int *)(padding_size_ptr + padding_size_stride_y)); + in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN; + + if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN) + { + VSTORE(VEC_SIZE) + ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr); + } + else + { + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], + in_index[2], in_index[3])), + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), + 0, (__global DATA_TYPE *)out.ptr); + } +} + +#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && + // defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl new file mode 100644 index 000000000..a26e762e8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) +/** Perform space to depth rearrangement of tensor + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. + * e.g. 
-DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+ *            argument using -DZ_IN=size. e.g. -DZ_IN=16
+ * @attention Block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
+ *            -DBLOCK_SIZE=1
+ *
+ * @param[in]  input_ptr                             Pointer to the source image. Supported data
+ *                                                   types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                        Stride of the source image in X dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X
+ *                                                   processed per workitem (in bytes)
+ * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y
+ *                                                   processed per workitem (in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in
+ *                                                   bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z
+ *                                                   processed per workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source
+ *                                                   image
+ * @param[out] output_ptr                            Pointer to the destination image. Supported
+ *                                                   data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination image in X dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X
+ *                                                   processed per workitem (in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination image in Y dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y
+ *                                                   processed per workitem (in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z
+ *                                                   processed per workitem (in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension
+ *                                                   (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W
+ *                                                   processed per workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *                                                   destination image
+ */
+__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+  int out_index[4] = {0};
+  int in_index[4] = {0};
+
+  in_index[0] = get_global_id(0);        // W
+  in_index[1] = get_global_id(1);        // H
+  in_index[2] = get_global_id(2) % Z_IN; // C
+  in_index[3] = get_global_id(2) / Z_IN; // B
+
+  out_index[0] = in_index[0] / BLOCK_SIZE;
+  out_index[1] = in_index[1] / BLOCK_SIZE;
+  out_index[2] =
+      in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
+  out_index[3] = in_index[3];
+
+  *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
+                                          out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
+ *            e.g. -DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor
+ *            argument using -DZ_IN=size. e.g.
-DZ_IN=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. + * -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + */ +__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4] = {0}; + int in_index[4] = {0}; + + in_index[0] = get_global_id(0); // C + in_index[1] = get_global_id(1); // W + in_index[2] = get_global_id(2) % Z_IN; // H + in_index[3] = get_global_id(2) / Z_IN; // B + + out_index[0] = + in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN; + out_index[1] = in_index[1] / BLOCK_SIZE; + out_index[2] = in_index[2] / BLOCK_SIZE; + out_index[3] = in_index[3]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], + out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl new file mode 100644 index 000000000..50472e4f9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers.h"
+
+__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
+                          __global int *in_ind_buf, const int n)
+{
+  int gid = get_global_id(0);
+  int lws = get_local_size(0);
+  int groups = get_num_groups(0);
+  int gws = lws * groups;
+  int iter = n / gws;
+
+  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
+
+  // Copy the keys into a plain buffer and initialize the index buffer with the identity mapping.
+  for (int i = 0; i < iter; ++i)
+  {
+    int idx = i * gws + gid;
+    in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
+    in_ind_buf[idx] = idx;
+  }
+}
+
+__kernel void topkv2_find_first_negative(__global float *out_key_buf,
+                                         __global int *first_negative_idx, int n)
+{
+  int gid = get_global_id(0);
+
+  if (gid == n - 1)
+  {
+    // if the last item is positive, the first negative index is n.
+    if (out_key_buf[gid] > 0.f)
+      *first_negative_idx = n;
+  }
+  else if (gid == 0)
+  {
+    // if the first item is negative, the first negative index is 0.
+    if (out_key_buf[gid] < 0.f)
+      *first_negative_idx = 0;
+  }
+  else
+  {
+    // if its left neighbor is positive and it is negative, it is the first negative item.
+    if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f)
+      *first_negative_idx = gid;
+  }
+}
+
+// The index buffers hold integer indices, so they must be declared as int pointers;
+// copying them through float pointers could corrupt the values (e.g. under flush-to-zero).
+__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
+                                       __global int *in_ind_buf, __global int *out_ind_buf,
+                                       __global int *first_negative_idx, int n)
+{
+  int gid = get_global_id(0);
+
+  int num_negs = n - *first_negative_idx;
+  int in_idx;
+
+  if (gid < num_negs)
+  {
+    in_idx = n - 1 - gid;
+  }
+  else
+  {
+    in_idx = gid - num_negs;
+  }
+
+  out_key_buf[gid] = in_key_buf[in_idx];
+  out_ind_buf[gid] = in_ind_buf[in_idx];
+}
+
+__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
+                           __global float *out_key_buf, __global int *out_ind_buf, int n)
+{
+  int gid = get_global_id(0);
+
+  Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
+  Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+
+  int idx = n - 1 - gid;
+
+  *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
+  *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
new file mode 100644
index 000000000..9594daf19
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "helpers.h" + +__global inline float *get_vec_elem(Vector *vec, int idx) +{ + return (__global float *)(vec->ptr + idx * vec->stride_x); +} + +__global inline int *get_vec_elem_int(Vector *vec, int idx) +{ + return (__global int *)(vec->ptr + idx * vec->stride_x); +} + +// A utility function to swap two elements +void swap(__global float *a, __global float *b) +{ + float t = *a; + *a = *b; + *b = t; +} + +void swap_idx(__global int *a, __global int *b) +{ + int t = *a; + *a = *b; + *b = t; +} + +/* This function is same in both iterative and recursive*/ +int partition(Vector *arr, __global int *indices, int l, int h) +{ + float x = *get_vec_elem(arr, h); + int i = (l - 1); + + for (int j = l; j <= h - 1; j++) + { + if (*get_vec_elem(arr, j) >= x) + { + i++; + swap(get_vec_elem(arr, i), get_vec_elem(arr, j)); + swap_idx(&indices[i], &indices[j]); + } + } + swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h)); + swap_idx(&indices[i + 1], &indices[h]); + return (i + 1); +} + +/* A[] --> Array to be sorted, + l --> Starting index, + h --> Ending index */ +void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h) +{ + // Create an auxiliary stack + + // initialize top of stack + int top = -1; + + // push initial values of l and h to stack + stack[++top] = l; + stack[++top] = h; + + // Keep popping from stack while is not empty + while (top >= 0) + { + // Pop h and l + h = stack[top--]; + l = stack[top--]; + + // Set pivot element at its correct position + // in sorted array + int p = partition(arr, indices, l, h); + + // If there are elements on left side of pivot, + // then push left side to stack + if (p - 1 > l) + { + stack[++top] = l; + stack[++top] = p - 1; + } + + // If there are elements on right side of pivot, + // then push right side to stack + if (p + 1 < h) + { + stack[++top] = p + 1; + stack[++top] = h; + } + } +} + +__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values), + VECTOR_DECLARATION(topk_indices), __global int *indices, + __global int *temp_stack, int k, int n) +{ + Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); + Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values); + Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices); + + for (int i = 0; i < n; ++i) + { + indices[i] = i; + } + + quickSortIterative(&input, indices, temp_stack, 0, n - 1); + + // extract k items. + for (int i = 0; i < k; ++i) + { + *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i); + *get_vec_elem_int(&topk_indices, i) = indices[i]; + } +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl new file mode 100644 index 000000000..f6830d229 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// reference: +// https://code.google.com/archive/p/ocl-radix-sort/source/default/source +// OpenCL kernel sources for the CLRadixSort class +// the #include does not exist in OpenCL +// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr +// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html +// if you find this software usefull you can cite the following work in your reports or articles: +// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. +// http://hal.archives-ouvertes.fr/hal-00596730 + +// Reference for floating point radix sort: +// http://www.codercorner.com/RadixSortRevisited.htm + +// compute the histogram for each radix and each virtual processor for the pass +__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms, + const int pass, __local int *loc_histo, const int n) +{ + int it = get_local_id(0); // i local number of the processor + int ig = get_global_id(0); // global number = i + g I + + int gr = get_group_id(0); // g group number + + int groups = get_num_groups(0); + int items = get_local_size(0); + + // set the local histograms to zero + for (int ir = 0; ir < _RADIX; ir++) + { + loc_histo[ir * items + it] = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // range of keys that are analyzed by the work item + int size = n / groups / items; // size of the sub-list + int start = ig * size; // beginning of the sub-list + + unsigned int key; + int shortkey, k; + + // compute the index + // the computation depends on the transposition + for (int j = 0; j < size; j++) + { +#ifdef TRANSPOSE + k = groups * items * j + ig; +#else + k = j + start; +#endif + + key = *((__global unsigned int *)(in_key_buf + k)); + + // extract the group of _BITS bits of the pass + // the result is in the range 0.._RADIX-1 + shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); + + // increment the local histogram + loc_histo[shortkey * items + it]++; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // copy the local histogram to the global one + for (int ir = 0; ir < _RADIX; ir++) + { + d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; + } + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +// initial transpose of the list for improving +// coalescent memory access +__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol, + const int nbrow, const __global int *inperm, __global int *outperm, + __local int *blockmat, __local int *blockperm, const int tilesize) +{ + + int i0 = get_global_id(0) * tilesize; // first row index + int j = get_global_id(1); // column index + + int jloc = get_local_id(1); // local column index + + // fill the cache + for (int iloc = 0; iloc < tilesize; iloc++) + { + int k = (i0 + iloc) * nbcol + j; // position in the matrix + blockmat[iloc * tilesize + jloc] = invect[k]; +#ifdef PERMUT + blockperm[iloc * tilesize + jloc] = inperm[k]; +#endif + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // first row index in the transpose + int j0 = get_group_id(1) * tilesize; + + // put the cache at the good place + for (int iloc = 0; iloc < tilesize; iloc++) + { + int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose + outvect[kt] = blockmat[jloc * tilesize + iloc]; +#ifdef PERMUT + outperm[kt] = blockperm[jloc * tilesize + iloc]; +#endif + } +} + +// each virtual processor 
reorders its data using the scanned histogram +__kernel void radixsort_reorder(__global float *in_key, __global float *out_key, + __global int *d_Histograms, const int pass, + __global int *indices_in, __global int *indices_out, + __local int *loc_histo, const int n) +{ + + int it = get_local_id(0); + int ig = get_global_id(0); + + int gr = get_group_id(0); + int groups = get_num_groups(0); + int items = get_local_size(0); + + int start = ig * (n / groups / items); + int size = n / groups / items; + + // take the histogram in the cache + for (int ir = 0; ir < _RADIX; ir++) + { + loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int newpos, shortkey, k, newpost; + unsigned int key; + + for (int j = 0; j < size; j++) + { +#ifdef TRANSPOSE + k = groups * items * j + ig; +#else + k = j + start; +#endif + float org_value = in_key[k]; + key = *(__global unsigned int *)(in_key + k); + shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); + + newpos = loc_histo[shortkey * items + it]; + +#ifdef TRANSPOSE + int ignew, jnew; + ignew = newpos / (n / groups / items); + jnew = newpos % (n / groups / items); + newpost = jnew * (groups * items) + ignew; +#else + newpost = newpos; +#endif + + // d_outKeys[newpost]= key; // killing line !!! + out_key[newpost] = org_value; + +#ifdef PERMUT + indices_out[newpost] = indices_in[k]; +#endif + + newpos++; + loc_histo[shortkey * items + it] = newpos; + } +} + +// perform a parallel prefix sum (a scan) on the local histograms +// (see Blelloch 1990) each workitem worries about two memories +// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html +__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp, + __global int *globsum) +{ + int it = get_local_id(0); + int ig = get_global_id(0); + int decale = 1; + int n = get_local_size(0) * 2; + int gr = get_group_id(0); + + // load input into local memory + // up sweep phase + temp[2 * it] = histo[2 * ig]; + temp[2 * it + 1] = histo[2 * ig + 1]; + + // parallel prefix sum (algorithm of Blelloch 1990) + for (int d = n >> 1; d > 0; d >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + if (it < d) + { + int ai = decale * (2 * it + 1) - 1; + int bi = decale * (2 * it + 2) - 1; + temp[bi] += temp[ai]; + } + decale *= 2; + } + + // store the last element in the global sum vector + // (maybe used in the next step for constructing the global scan) + // clear the last element + if (it == 0) + { + globsum[gr] = temp[n - 1]; + temp[n - 1] = 0; + } + + // down sweep phase + for (int d = 1; d < n; d *= 2) + { + decale >>= 1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (it < d) + { + int ai = decale * (2 * it + 1) - 1; + int bi = decale * (2 * it + 2) - 1; + + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // write results to device memory + + histo[2 * ig] = temp[2 * it]; + histo[2 * ig + 1] = temp[2 * it + 1]; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +// use the global sum for updating the local histograms +// each work item updates two values +__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum) +{ + int ig = get_global_id(0); + int gr = get_group_id(0); + + int s; + + s = globsum[gr]; + + // write results to device memory + histo[2 * ig] += s; + histo[2 * ig + 1] += s; + + barrier(CLK_GLOBAL_MEM_FENCE); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp new 
file mode 100644 index 000000000..7f4b5b0df --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis) +{ + TensorShape out_shape{input_shape}; + + out_shape.set(axis, 1); + + return out_shape; +} +} // namespace + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ArgOperation /*op*/) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) != + output->tensor_shape().num_dimensions(), + "Input's rank is not same with output"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(), + "output shape's size does not match axis"); + + const auto num_dimensions = input->tensor_shape().num_dimensions(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank)."); + return Status{}; +} + +} // namespace + +CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} + +void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, + ArgOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + _input = input; + _output = output; + _axis = axis; + + std::unique_ptr<ITensorInfo> output_info = output->info()->clone(); + output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis)); + + // Construct kernel and set op_code based on type of ArgOperation as specified by object op + std::string kernel_name = "arg_op"; + int op_code = 0; + if (op == ArgOperation::MAX) + { + op_code = 1; + } + else if (op == ArgOperation::MIN) + { + op_code = 2; + } + else + throw std::runtime_error("Operation not supported, yet"); + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2))); + build_opts.emplace("-DOP_CODE=" + 
support::cpp11::to_string(op_code)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output_info, Steps()); + + Coordinates coord; + coord.set_num_dimensions(output_info->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const uint32_t axis, ArgOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + + return Status{}; +} + +void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &shape_in = _input->info()->tensor_shape(); + + unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters + + _kernel.setArg<cl_int>(idx++, _axis); + _kernel.setArg<cl_int>(idx++, shape_in[_axis]); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Copy output's shape in order to use for recovering at end of this method + const TensorShape shape_out = _output->info()->tensor_shape(); + _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); + + // Recover output's shape of output tensor + _output->info()->set_tensor_shape(shape_out); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp new file mode 100644 index 000000000..c14e73634 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, BinaryLogicalOperation op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "binary_logical_op"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); + + int op_code = 0; + switch (op) + { + case BinaryLogicalOperation::AND: + op_code = 1; + break; + case BinaryLogicalOperation::OR: + op_code = 2; + break; + default: + throw std::runtime_error("Operation not supported, yet"); + } + + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const ValidRegion &valid_region = broadcast_pair.second; + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLBinaryLogicalOpKernel::run(const Window 
&window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLBinaryLogicalOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp new file mode 100644 index 000000000..35f607bd0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLCastKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} + +void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Set kernel build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_OUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + // Create kernel + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + const float scale_in = input->info()->quantization_info().scale; + const int offset_in = input->info()->quantization_info().offset; + build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options())); + } + else if (is_data_type_quantized_asymmetric(output->info()->data_type())) + { + const float scale_in = output->info()->quantization_info().scale; + const int offset_in = output->info()->quantization_info().offset; + build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in)); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options())); + } + else + { + build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT"); + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("cast", build_opts.options())); + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while 
(collapsed.slide_window_slice_3D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp new file mode 100644 index 000000000..2a3433c2b --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +// TODO Use this validation function +#if 0 +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const int32_t block_size) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1, + "Block size should be greater than or equal to 1."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size, + "Output width should be equal to (Input width * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size, + "Output height should be equal to (Input height * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0, + "Input depth should be divisible by (block size * block size)"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->dimension(2) != input->dimension(2) / (block_size * block_size), + "Output depth should be equal to (Input depth / (block size * block size))"); + + return Status{}; +} +#endif +} // namespace + +CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, + const int32_t block_size) +{ + // TODO Add validation of data_layout + _input = input; + _output = output; + + // Set kernel build options + auto layout_out = output->info()->data_layout(); + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto depth = output->info()->dimension(index_depth); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth)); + build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z())); + + // Create kernel + _kernel = 
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup input slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..0862b78bf --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+CLEmbeddingLookupKernel::CLEmbeddingLookupKernel()
+    : _input(nullptr), _output(nullptr), _lookups(nullptr)
+{
+}
+
+Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *lookups)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  // Input rank must be in [2, 4]
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+
+  return Status{};
+}
+
+void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const ICLTensor *lookups)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info()));
+
+  _input = input;
+  _output = output;
+  _lookups = lookups;
+
+  // Set kernel build options
+  std::stringstream kernel_name;
+  std::set<std::string> build_opts;
+  kernel_name << "embedding_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(
+      CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+}
+
+void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  Window win_lookup;
+  win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_in);
+    add_1D_tensor_argument(idx, _lookups, win_lookup);
+
+    enqueue(queue, *this, slice_in);
+  } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
new file mode 100644
index 000000000..718f615f9
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/core/UtilsEx.h" + +using namespace arm_compute; + +namespace +{ + +inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, + ITensorInfo *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + std::unique_ptr<ITensorInfo> output_info = input->clone(); + output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis)); + // Output auto initialization if not yet initialized + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); + + // Create window + Window win = calculate_max_window(*output, Steps()); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + return std::make_pair(Status{}, win); +} + +} // namespace + +CLGatherExKernel::CLGatherExKernel() + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +{ +} + +void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices, + ICLTensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), indices->info(), output->info(), axis)); + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + 
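+  // NOTE (editorial): a scalar reference for the gather computed here may help when reading the
+  // build options below. Assuming a row-major flattening, with 'outer' the product of the
+  // dimensions before 'axis', 'inner' the product of the dimensions after it, and 'idx' the
+  // flattened indices tensor (illustrative names only, not kernel symbols):
+  //
+  //   for (size_t o = 0; o < outer; ++o)
+  //     for (size_t k = 0; k < num_indices; ++k)
+  //       for (size_t s = 0; s < inner; ++s)
+  //         out[(o * num_indices + k) * inner + s] = in[(o * axis_dim + idx[k]) * inner + s];
+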
_input = input; + _output = output; + _indices = indices; + _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions())); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DOUTPUT_DIM_Z=" + + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + ICLKernel::configure_internal(win_config.second); +} + +Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + indices->clone().get(), + output->clone().get(), axis) + .first); + return Status{}; +} + +void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); + add_4D_tensor_argument(idx, _output, window_collapsed); + enqueue(queue, *this, window_collapsed, lws_hint()); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..31e98c9a8 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+  AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win, input_access, output_access);
+  input_access.set_valid_region(win, output->valid_region());
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+CLHashtableLookupKernel::CLHashtableLookupKernel()
+    : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits(nullptr)
+{
+  // DO NOTHING
+}
+
+Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys,
+                                         const ITensorInfo *input, const ITensorInfo *output,
+                                         const ITensorInfo *hits)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's shape was not set");
+
+  ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) ||
+                       output->dimension(output->num_dimensions() - 1) != lookups->dimension(0));
+  // Input rank must be in [2, 4]
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 || input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1);
+  ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1);
+  ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1);
+
+  return Status{};
+}
+
+void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys,
+                                        const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info()));
+
+  _lookups = lookups;
+  _keys = keys;
+  _input = input;
+  _output = output;
+  _hits = hits;
+
+  // Make _lookup_indices tensor
+  _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+  _lookup_indices->allocator()->init(
+      TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
+  _lookup_indices->allocator()->allocate();
+
+  // Set kernel build options
+  std::stringstream kernel_name;
+  std::set<std::string> build_opts;
+  kernel_name << "hashtable_lookup";
+
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+  build_opts.emplace("-DNUM_DIMS=" +
support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); + + enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..5db414f62 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_UNUSED(gamma); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) +{ +} + +void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? 
input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + _run_in_place = (output == nullptr) || (output == input); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), + gamma ? gamma->info() : nullptr, + beta ? beta->info() : nullptr, epsilon)); + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); + build_opts.add_option_if(gamma, "-DGAMMA"); + build_opts.add_option_if(beta, "-DBETA"); + build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); + build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + ICLKernel::configure_internal(std::get<1>(win_config)); +} + +Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? 
input->clone().get() : output->clone().get())))); + return Status{}; +} + +void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window collapsed_window = window.collapse(window, Window::DimZ); + + // We will process the planes together + if (_input->info()->data_layout() == DataLayout::NCHW) + { + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + } + else + { + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1)); + } + + Window vec_window; + vec_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, collapsed_window); + if (!_run_in_place) + { + add_4D_tensor_argument(idx, _output, collapsed_window); + } + if (_gamma) + { + add_1D_tensor_argument(idx, _gamma, vec_window); + } + if (_beta) + { + add_1D_tensor_argument(idx, _beta, vec_window); + } + + enqueue(queue, *this, collapsed_window, lws_hint()); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..ecfe05a51 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + return Status{}; +} + +} // namespace + +CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {} + +void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Create kernel + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp new file mode 100644 index 000000000..e7d587029 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info())); + + _input = input; + _alpha = alpha; + _output = output; + + // Create kernel + std::string kernel_name = "prelu"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + build_opts.emplace("-DOFF_IN=" + + support::cpp11::to_string(input->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().offset)); + build_opts.emplace("-DOFF_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().offset)); + build_opts.emplace("-DSCALE_IN=" + + support::cpp11::to_string(input->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_ALPHA=" + + support::cpp11::to_string(alpha->info()->quantization_info().scale)); + build_opts.emplace("-DSCALE_OUT=" + + support::cpp11::to_string(output->info()->quantization_info().scale)); + kernel_name += "_qasymm8"; + } + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), out_shape); + + if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16) + { + set_format_if_unknown(*output->info(), Format::F16); + } + else if (input->info()->data_type() == DataType::F32 || + alpha->info()->data_type() == DataType::F32) + { + 
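+      // If either operand is F32 (covering the mixed F16/F32 case), fall back to an F32 output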
set_format_if_unknown(*output->info(), Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info()); + + AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input->info()->tensor_shape(); + const TensorShape &in_shape2 = _alpha->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_input1); + add_3D_tensor_argument(idx, _alpha, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPReLUKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input->info()->dimension(0), _alpha->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..24e89db28 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+namespace
+{
+// NOTE This is necessary because it is not guaranteed that the axis positions of input and output
+// are the same.
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(axis, 1);
+
+  return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                          ReduceOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32, DataType::S32);
+  if (op == ReduceOperation::SUM)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+                                    "SUM does not support QASYMM8 yet");
+  }
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's shape was not set");
+
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than the input's rank.");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "Output shape's total size does not match the shape reduced along axis");
+
+  return Status{};
+}
+} // namespace
+
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const uint32_t axis, ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Construct kernel name
+  std::string kernel_name;
+  int op_code = 0;
+  if (op == ReduceOperation::MAX)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 1;
+  }
+  else if (op == ReduceOperation::MIN)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 2;
+  }
+  else if (op == ReduceOperation::SUM)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 3;
+  }
+  else if (op == ReduceOperation::MEAN)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 4;
+  }
+  else
+    throw std::runtime_error("Operation not supported yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" +
support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const uint32_t axis, ReduceOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+  return Status{};
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  _kernel.setArg<cl_int>(idx++, _axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+  // Support dimensions up to 4
+  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Save the output tensor's shape so that it can be restored at the end of this method
+  // TODO Remove changing and restoring the output's shape once it is guaranteed that the axis
+  // positions of input and output are the same
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+  idx = 0;
+  add_4D_tensor_argument(idx, _input, slice_in);
+  add_4D_tensor_argument(idx, _output, slice_out);
+  enqueue(queue, *this, slice_out, lws_hint());
+
+  // Restore the output tensor's original shape
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644
index 000000000..f7836b6cd
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+                          const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+                                  "The number of dimensions of input should be equal to that of output");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+                                  "The input and output layouts are different!");
+
+  // TODO Support other cases
+  if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial blocks are supported");
+  }
+  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial blocks are supported");
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
+  }
+
+  // Rank must be in [2, 4]
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 || input->num_dimensions() > 4,
+                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
+
+  if (input->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel()
+    : _input(nullptr), _block_size(nullptr), _padding_size(nullptr), _output(nullptr)
+{
+  // DO NOTHING
+}
+
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+                                       const ICLTensor *padding_size, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+  _input = input;
+  _block_size = block_size;
+  _padding_size = padding_size;
+  _output = output;
+
+  // Set kernel build options
+  // TODO Support other cases
+  std::string kernel_name = "space_to_batch_4d";
+  std::set<std::string> build_opts;
+  Window win;
+
+  if (input->info()->data_layout() == DataLayout::NCHW)
+  {
+    kernel_name += "_nchw";
+    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DWIDTH_IN=" +
support::cpp11::to_string(input->info()->dimension(0)));
+
+    win = calculate_max_window(*output->info(), Steps());
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+  }
+  else if (input->info()->data_layout() == DataLayout::NHWC)
+  {
+    kernel_name += "_nhwc";
+    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DVEC_SIZE=" +
+                       support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    input_access.set_valid_region(win, output->info()->valid_region());
+
+    if (window_changed)
+    {
+      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported layout");
+  }
+
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    build_opts.emplace("-DZERO_VALUE=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+#if defined(ARM_COMPUTE_DEBUG_ENABLED)
+  const_cast<ICLTensor *>(_block_size)->map(queue);
+  const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+  const size_t num_dimensions = _input->info()->num_dimensions();
+  const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
+  uint32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+  for (size_t i = 0; i < num_spacial_dimensions; ++i)
+  {
+    const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+    const int32_t padding_size_pre =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+    const int32_t padding_size_post =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+    ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 || padding_size_post < 0,
+                             "Padding size should be greater than or equal to 0");
+
+    if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(i) !=
+              (_input->info()->dimension(i) + padding_size_pre + padding_size_post) / block_size,
+          "Dimension value of spatial block does not match output's dimension value");
+    }
+    else
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
_output->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) != + (_input->info()->dimension(num_dimensions - num_spacial_dimensions - 1 + i) + + padding_size_pre + padding_size_post) / + block_size, + "Dimension value of spatial block does not match output's dimension value"); + } + + batch_size *= block_size; + } + ARM_COMPUTE_ERROR_ON_MSG( + _output->info()->dimension(num_dimensions - 1) != batch_size, + "Output batch size should be equal to input batch size * (multiplication of all block size)"); + + const_cast<ICLTensor *>(_block_size)->unmap(queue); + const_cast<ICLTensor *>(_padding_size)->unmap(queue); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + + Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_in(slice_out); + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_in.set(3, Window::Dimension(0, 0, 0)); + + // Set block size window + Window win_block = calculate_max_window(*_block_size->info(), Steps()); + + // Set padding size window + Window win_padding = calculate_max_window(*_padding_size->info(), Steps()); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + add_1D_tensor_argument(idx, _block_size, win_block); + add_2D_tensor_argument(idx, _padding_size, win_padding); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp new file mode 100644 index 000000000..b085192a2 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+                                  "Input batch should be equal to Output batch");
+
+  auto layout_out = input->data_layout();
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+
+  auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+  auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
+  auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
+      "Output depth should be equal to (input depth * block size * block size)");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
+                                      (input->dimension(index_height) % block_size),
+                                  "Input height and width should be divisible by block size");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
+          (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
+      "Output height and width should be equal to "
+      "input_height/blocksize and input_width/blocksize respectively");
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
+
+  _input = input;
+  _output = output;
+
+  // Set kernel build options
+  auto layout_out = input->info()->data_layout();
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+  auto depth = input->info()->dimension(index_depth);
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
+  build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
+
+  // Create kernel
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+      "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*input->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
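+  // NOTE (editorial): a scalar sketch of the space_to_depth mapping the OpenCL kernel applies,
+  // assuming the TensorFlow-style channel ordering (illustrative indices, not kernel symbols).
+  // For NCHW, with bs = block_size and C_in the input depth:
+  //
+  //   for each output element (b, c, y, x):            // c in [0, C_in * bs * bs)
+  //     in_c = c % C_in;                               // source channel
+  //     offs = c / C_in;                               // which (dy, dx) sub-pixel of the block
+  //     out[b][c][y][x] = in[b][in_c][y * bs + offs / bs][x * bs + offs % bs];
+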
ICLKernel::configure_internal(win); +} + +void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup output slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp new file mode 100644 index 000000000..4f2b388c9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} + +void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + _topk_values = topk_values; + _topk_indices = topk_indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); + + unsigned int idx = 3 * num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *indices); + _kernel.setArg(idx++, *temp_stack); + _kernel.setArg<cl_int>(idx++, k); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 1, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + add_1D_tensor_argument(idx, _topk_values, window); + add_1D_tensor_argument(idx, _topk_indices, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} + +void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, + int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); + ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); + + unsigned int idx = num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *in_key_buf); + _kernel.setArg(idx++, *in_ind_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +// This kernel makes a histogram of radix for each work item. 
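+// NOTE (editorial): for orientation, the overall pass structure of this (currently disabled)
+// radix-sort kernel family, presumably driven by the CLTopKV2 runtime function; _BITS bits of
+// the key are consumed per pass:
+//
+//   for (pass = 0; pass < key_bits / _BITS; ++pass) {
+//     histogram(in_keys, pass);   // CLRadixSortHistogram: per work-item digit counts
+//     scan + paste(histograms);   // CLRadixSortScan/GlobalScan/PasteHistogram: prefix sums
+//     reorder(in_keys, pass);     // CLRadixSortReorder: stable scatter by current digit
+//   }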
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} + +void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); + + int loc_histo_size = radix * _ITEMS * sizeof(cl_int); + + unsigned int idx = 1; + _kernel.setArg(idx++, *hist_buf); + + idx = 3; + _kernel.setArg(idx++, loc_histo_size, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg<cl_int>(2, _pass); + + cl::NDRange lws = cl::NDRange(_ITEMS, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} + +void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} + +void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, + int bits) +{ + ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = 
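The `setArg(idx, size, nullptr)` calls above are the standard OpenCL way to size a __local buffer from the host: only the byte count crosses the host/device boundary, and the driver allocates per-work-group memory for the matching __local parameter. A self-contained sketch, assuming the device-side kernel declares a __local int pointer at argument 3 as the histogram kernel above appears to (hypothetical helper, not from this commit):

#include "arm_compute/core/CL/OpenCL.h" // ACL's OpenCL loader header, as used elsewhere in this tree

// Bind a work-group-local counter array of radix * items ints to slot 3.
void bind_local_histogram(cl::Kernel &kernel, unsigned int radix, unsigned int items)
{
  const size_t loc_histo_bytes = radix * items * sizeof(cl_int);
  kernel.setArg(3, loc_histo_bytes, nullptr); // nullptr + nonzero size => __local allocation
}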
static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *glob_sum_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *temp_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} + +void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortReorder::CLRadixSortReorder() + : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), + _out_ind_buf(nullptr) +{ +} + +void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); + + unsigned int idx = 2; + _kernel.setArg(idx++, *hist_buf); + + idx = 6; + _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + 
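Taken together, the four radix-sort kernels form one pass of a classic LSD radix sort; the `_pass`, `_in_key_buf`, and `_out_key_buf` members suggest the caller re-runs them for each digit while ping-ponging the key/index buffers. A self-contained sketch of that driver loop, with the kernel launches abstracted behind callables (hypothetical, not from this commit):

#include <functional>

void radix_sort(int total_bits, int bits_per_pass, const std::function<void(int)> &histogram,
                const std::function<void()> &scan, const std::function<void()> &global_scan,
                const std::function<void()> &paste, const std::function<void(int)> &reorder,
                const std::function<void()> &swap_key_index_buffers)
{
  for (int pass = 0; pass < total_bits / bits_per_pass; ++pass)
  {
    histogram(pass);          // count digit occurrences per work item
    scan();                   // prefix-sum each histogram split locally
    global_scan();            // prefix-sum across the per-split totals
    paste();                  // add the global offsets back into the local sums
    reorder(pass);            // stable scatter of keys and indices
    swap_key_index_buffers(); // this pass's output feeds the next pass
  }
}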
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); + cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg(1, *_out_key_buf); + _kernel.setArg<cl_int>(3, _pass); + _kernel.setArg(4, *_in_ind_buf); + _kernel.setArg(5, *_out_ind_buf); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} + +void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); + + unsigned int idx = 1; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_out_key_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() + : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts)); + + unsigned int idx = 4; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_in_key_buf); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_in_ind_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Store::CLTopKV2Store() + : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(k == 0); + ARM_COMPUTE_ERROR_ON(k > n); + + _values = values; + _indices = indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + 
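The negative-handling kernels above exist because the sort operates on raw float bit patterns: reinterpreted as unsigned integers, non-negative floats already order correctly, while negative floats sort above them and in reversed order, so the pipeline appears to locate the first negative key and patch that range afterwards (hence the names topkv2_find_first_negative and topkv2_reorder_negatives). The usual alternative, shown below as a self-contained sketch (hypothetical, not from this commit), is an order-preserving bit transform applied before sorting:

#include <cstdint>
#include <cstring>

// Map a float to a uint32 key whose unsigned order equals float order:
// flip every bit of negatives, flip only the sign bit of non-negatives.
uint32_t float_to_radix_key(float f)
{
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}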
static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); + + unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, k, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) +{ + _out_key_buf = out_key_buf; + _out_ind_buf = out_ind_buf; +} + +void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _values, window); + add_1D_tensor_argument(idx, _indices, window); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +} // namespace arm_compute +#endif // Disable GPU implementation diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp new file mode 100644 index 000000000..6cc8d9d13 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel() + : _input(nullptr), _output(nullptr), _inner_border(), _info() +{ +} + +Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0); + + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c)); + for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, + "inner_border_right must be smaller that stride_x"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, + "inner_border_top must be smaller that stride_y"); + + return Status{}; +} + +void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _inner_border = inner_border; + _info = info; + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate( + input->info(), output->info(), inner_border, info)); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + _kernel = static_cast<cl::Kernel>( + CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options())); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure_internal(win); +} + +void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const DataLayout 
data_layout = _input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + const int out_start_x = _info.pad_left(); + const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right - + _info.pad_right() + _info.stride().first - 1; + const int out_step_x = _info.stride().first; + + const int out_start_y = _inner_border.top + _info.pad_top(); + const int out_end_y = + _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1; + const int out_step_y = _info.stride().second; + + switch (data_layout) + { + case DataLayout::NCHW: + { + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + + Window slice_out = collapsed.first_slice_window_3D(); + slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (collapsed.slide_window_slice_3D(slice_in) && + collapsed.slide_window_slice_3D(slice_out)); + break; + } + case DataLayout::NHWC: + { + // NOTE: not collapsing in NHWC + Window slice_out = window.first_slice_window_3D(); + slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x)); + slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y)); + + Window slice_in = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice_out); + } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out)); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data layout"); + } +} diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp new file mode 100644 index 000000000..8ac667ceb --- /dev/null +++ b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
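The window arithmetic in run() above implements a strided scatter: stepping the output window by the stride while walking the input densely drops each input pixel onto every stride-th output location, and the remaining output pixels keep whatever fill value was written beforehand. A self-contained scalar reference, ignoring inner_border and the exact end clamping (hypothetical, not from this commit):

#include <cstddef>
#include <vector>

// Input pixel (x, y) lands at output (start_x + x*stride_x, start_y + y*stride_y).
void upsample_scatter(const std::vector<float> &in, size_t in_w, size_t in_h,
                      std::vector<float> &out, size_t out_w, size_t stride_x, size_t stride_y,
                      size_t start_x, size_t start_y)
{
  for (size_t y = 0; y < in_h; ++y)
    for (size_t x = 0; x < in_w; ++x)
      out[(start_y + y * stride_y) * out_w + (start_x + x * stride_x)] = in[y * in_w + x];
}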
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <cstddef> +#include <cstdint> + +namespace arm_compute +{ +CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {} + +bool CPPUpsampleKernelEx::is_parallelisable() const { return false; } + +void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _input = input; + _output = output; + _info = info; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICPPKernel::configure(win); +} + +void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + // Initialize _scaled_output buffer + const int width_scaled = _output->info()->dimension(0); + const int height_scaled = _output->info()->dimension(1); + const int stride_x = _info.stride().first; + const int stride_y = _info.stride().second; + const int start_x = _info.pad_left(); + const int start_y = _info.pad_top(); + const int end_y = height_scaled - _info.pad_bottom(); + const int end_x = width_scaled - _info.pad_top(); + const size_t element_size = _input->info()->element_size(); + + // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset + const uint8_t fill_value = + _output->info()->data_type() == DataType::QASYMM8 + ? utility::clamp<uint8_t>(_output->info()->quantization_info().offset) + : 0; + // Filling a value different than 0 works only for QASYMM8 datatype since we are filling 1byte + // values in a buffer of uint8_ts + std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value); + + // Create window + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x)); + window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y)); + + // Create iterators + Iterator in(_input, window); + Iterator out(_output, window_out); + + execute_window_loop( + window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp new file mode 100644 index 000000000..4508f5800 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
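One detail in CPPUpsampleKernelEx::run above is worth spelling out: with asymmetric quantization, real = scale * (q - offset), so real 0.0 is represented by q = offset, and padding a QASYMM8 tensor with a literal 0 byte would inject the real value -scale * offset instead of zero. A self-contained sketch of the fill-value choice (hypothetical, not from this commit):

#include <algorithm>
#include <cstdint>

// The quantized encoding of real 0.0 is the zero point itself, clamped to the
// representable uint8 range (mirroring the utility::clamp call above).
uint8_t quantized_zero_fill(int zero_point)
{
  return static_cast<uint8_t>(std::min(255, std::max(0, zero_point)));
}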
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" + +#include <algorithm> +#include "arm_compute/core/Types.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +namespace +{ +void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out) +{ + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +using namespace arm_compute; +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op_templ( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_value, + output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = + reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = + reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, + input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +} // namespace + +namespace arm_compute +{ + +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, + const float32x4_t &scale) +{ + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; + return out; +} + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, + const float32x4_t &invscale) +{ + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; + store_quantized_int32(output_ptr, out); +} + +float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale) +{ + const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); + const int32x4_t voffset = vdupq_n_s32(offset); + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t broadcast_vector = {{ + 
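The template above encodes a simple contract between the vectorized and scalar paths: the NEON callback consumes as many full vector lanes as fit and returns the first index it did not handle, and a scalar tail loop finishes the remainder; the broadcast branch works the same way once the broadcast operand is detected by its zero X-step. A self-contained sketch of that contract (hypothetical, not from this commit):

// VecFn has the same shape as the neon_func parameter above: it takes
// (start, end, step, in1, in2, out) and returns the first unprocessed index.
template <typename T, typename VecFn, typename ScalarFn>
void vector_then_scalar_tail(const T *a, const T *b, T *out, int n, int step, VecFn vec_body,
                             ScalarFn scalar_body)
{
  int x = vec_body(0, n, step, a, b, out); // full vector lanes
  for (; x < n; ++x)                       // fewer than `step` leftovers remain
    out[x] = scalar_body(a[x], b[x]);
}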
vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_low_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( + vmovl_u8(vget_high_u8(broadcast_value_vec))))), + voffset)), + vscale), + }}; + return broadcast_vector; +} + +void elementwise_op_quantized( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, + float32x4_t, float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, + int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + const float output_scale = out->info()->quantization_info().scale; + const int output_offset = out->info()->quantization_info().offset; + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from + // zero) + const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale); + + if (is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
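The helpers above vectorize the usual QASYMM8 round trip: dequantize with real = scale * (q - offset), operate in float, then requantize; the +0.5f folded into voffseto makes the truncating float-to-int conversion round to the nearest integer for non-negative results. The scalar equivalent, as a self-contained sketch (hypothetical, not from this commit):

#include <algorithm>
#include <cstdint>

float dequantize_qasymm8(uint8_t q, float scale, int offset)
{
  return scale * (static_cast<int>(q) - offset);
}

uint8_t quantize_qasymm8(float real, float scale, int offset)
{
  const int q = static_cast<int>(real / scale + offset + 0.5f); // truncation + 0.5 ~= rounding
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}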
in2 : in1; + + const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info(); + const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto non_broadcast_input_ptr = + reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = + dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, + invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo.scale, + non_broadcast_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(broadcast_value, broadcast_qinfo.scale, broadcast_qinfo.offset); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, + out->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset); + const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset); + const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const QuantizationInfo input1_qinfo = in1->info()->quantization_info(); + const QuantizationInfo input2_qinfo = in2->info()->quantization_info(); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = + (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, + output_ptr, voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = + scvt_f32_qasymm8(*(input1_ptr + x), input1_qinfo.scale, input1_qinfo.offset); + const float bfs = + scvt_f32_qasymm8(*(input2_ptr + x), input2_qinfo.scale, input2_qinfo.offset); + *(output_ptr + x) = (*scalar_func)(afs, bfs, out->info()->quantization_info()); + } + }, + input1, input2, output); + } +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor 
*out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)) +{ + elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)) +{ + elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp new file mode 100644 index 000000000..d2f42de53 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace arm_compute +{ + +template <BinaryLogicalOperation op, typename ScalarType> +inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case BinaryLogicalOperation::AND: + res = a & b; + break; + case BinaryLogicalOperation::OR: + res = a | b; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op, typename VectorType> +inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + + switch (op) + { + case BinaryLogicalOperation::AND: + res = wrapper::vand(a, b); + break; + case BinaryLogicalOperation::OR: + res = wrapper::vorr(a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op> +inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) +{ + uint8x16x4_t out = {{ + elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), + }}; + return out; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_logic_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? 
a : broadcast_vector); +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, + ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>, + &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_logic_op_loop<op, ScalarType, VectorType>); +} + +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( + const ITensor *input1, const ITensor *input2, ITensor *output, + std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window) { func(input1, input2, output, window); }; + } + return nullptr; +} + +template <BinaryLogicalOperation op> +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> +configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + + return configure_func(input1, input2, output, map_function); +} + +void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1, + const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); + configure_common(input1, input2, output); + switch (op) + { + case BinaryLogicalOperation::AND: + _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output); + break; + case BinaryLogicalOperation::OR: + _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); 
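configure_func() above dispatches on a string key assembled from the three tensor data types, matching map entries such as "op_U8_U8_U8". A self-contained sketch of the scheme (hypothetical, not from this commit):

#include <functional>
#include <map>
#include <string>

using KernelFn = std::function<void()>;

KernelFn lookup_kernel(const std::map<std::string, KernelFn> &table, const std::string &dt_in1,
                       const std::string &dt_in2, const std::string &dt_out)
{
  const auto it = table.find("op_" + dt_in1 + "_" + dt_in2 + "_" + dt_out);
  return it != table.end() ? it->second : KernelFn{}; // empty function if unsupported
}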
+ } +} + +Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) +{ + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, + DataType::QASYMM8); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); + return Status{}; +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp new file mode 100644 index 000000000..7e4fc129b --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
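The `out_shape.total_size() == 0` test in validate_arguments() above is how incompatible shapes surface: TensorShape::broadcast_shape() follows the usual rule that, per dimension, the two sizes must match or one of them must be 1. A self-contained sketch of that rule (hypothetical, not from this commit):

#include <algorithm>
#include <cstddef>
#include <vector>

bool broadcast_compatible(const std::vector<size_t> &a, const std::vector<size_t> &b)
{
  const size_t n = std::max(a.size(), b.size());
  for (size_t i = 0; i < n; ++i)
  {
    const size_t da = i < a.size() ? a[i] : 1; // missing dimensions count as 1
    const size_t db = i < b.size() ? b[i] : 1;
    if (da != db && da != 1 && db != 1)
      return false;
  }
  return true;
}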
+ */ +#include "arm_compute/core/NEON/kernels/NECastKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL && + input->data_type() != DataType::U8); + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::U32, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); + + // NECastKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_tuple(Status{}, win); +} + +typedef struct bool8x16 +{ + uint8x16_t val; +} bool8x16_t; + +static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; } + +template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; } +template <> inline uint8x16_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + return vshrq_n_u8(mask, 7); // true -> 1, false -> 0 +} + +template <> inline uint32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + 
vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const bool8x16_t &v) +{ + const uint8x16_t vu8 = vreinterpretq_u8_b8(v); + const uint8x16_t zero_uint8x16 = vdupq_n_u8(0); + uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16); + uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0 + + const float32x4x4_t ret = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))), + }}; + + return ret; +} + +template <> inline uint32x4x4_t vcast(const uint8x16_t &v) +{ + const uint32x4x4_t ret = {{ + vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))), + vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))), + vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const uint8x16_t &v) +{ + const int32x4x4_t ret = {{ + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint8x16_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3])))); +} + +template <> inline uint32x4x4_t vcast(const int32x4x4_t &v) +{ + // Saturate cast + const uint32x4x4_t ret = {{ + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))), + vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))), + vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const int32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]), + vcvtq_f32_s32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const uint32x4x4_t &v) +{ + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))), + vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3])))); +} + +template <> inline int32x4x4_t vcast(const uint32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))), + 
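The int32-to-uint8 path above narrows in two saturating stages: vqmovun_s32 clamps signed 32-bit lanes into the unsigned 16-bit range, and vqmovn_u16 then clamps into the unsigned 8-bit range, so negatives end at 0 and anything above 255 ends at 255. As a self-contained sketch (hypothetical, not from this commit):

#include <arm_neon.h>

// 16 int32 lanes -> 16 uint8 lanes, saturating at both ends.
uint8x16_t narrow_s32x16_to_u8(int32x4x4_t v)
{
  const uint16x8_t lo = vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]));
  const uint16x8_t hi = vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3]));
  return vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi));
}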
vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))), + vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))), + vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))), + }}; + + return ret; +} + +template <> inline float32x4x4_t vcast(const uint32x4x4_t &v) +{ + const float32x4x4_t ret = {{ + vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]), + vcvtq_f32_u32(v.val[3]), + }}; + + return ret; +} + +template <> inline uint8x16_t vcast(const float32x4x4_t &v) +{ + // Saturate cast + return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])), + vqmovun_s32(vcvtq_s32_f32(v.val[1])))), + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])), + vqmovun_s32(vcvtq_s32_f32(v.val[3]))))); +} + +template <> inline uint32x4x4_t vcast(const float32x4x4_t &v) +{ + const uint32x4x4_t ret = {{ + vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]), + vcvtq_u32_f32(v.val[3]), + }}; + + return ret; +} + +template <> inline int32x4x4_t vcast(const float32x4x4_t &v) +{ + const int32x4x4_t ret = {{ + vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]), + vcvtq_s32_f32(v.val[3]), + }}; + + return ret; +} + +template <typename T> struct cast_vector; +template <> struct cast_vector<bool> +{ + using type = bool8x16_t; +}; +template <> struct cast_vector<uint8_t> +{ + using type = uint8x16_t; +}; +template <> struct cast_vector<uint32_t> +{ + using type = uint32x4x4_t; +}; +template <> struct cast_vector<int32_t> +{ + using type = int32x4x4_t; +}; +template <> struct cast_vector<float> +{ + using type = float32x4x4_t; +}; + +template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v) +{ + wrapper::vstore(ptr, v); +} + +inline bool8x16_t vloadq(const bool *ptr) +{ + bool8x16_t ret; + ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr)); + return ret; +} + +template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr) +{ + return wrapper::vloadq(ptr); +} + +template <> inline typename cast_vector<bool>::type load_input(const bool *ptr) +{ + return vloadq(ptr); +} + +template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr) +{ + return vld4q_u32(ptr); +} + +template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr) +{ + return vld4q_s32(ptr); +} + +template <> inline typename cast_vector<float>::type load_input(const float *ptr) +{ + return vld4q_f32(ptr); +} + +template <typename T> inline T get_value(const T *ptr) { return *ptr; } + +template <> inline bool get_value(const bool *ptr) +{ + bool ret = (*ptr != 0); + return ret; +} + +template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); 
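A rounding subtlety follows in run_cast below: AArch64 builds use RoundingPolicy::TO_NEAREST_EVEN while the 32-bit Arm fallback uses TO_ZERO, so quantized leftovers can differ by one step between the two targets. A self-contained scalar sketch of the two policies (hypothetical, not from this commit):

#include <cmath>
#include <cstdint>

int32_t cast_f32_to_s32(float v, bool to_nearest_even)
{
  // std::nearbyint honours the current FP rounding mode, which defaults to
  // round-to-nearest, ties-to-even; the plain cast truncates toward zero.
  return to_nearest_even ? static_cast<int32_t>(std::nearbyint(v)) : static_cast<int32_t>(v);
}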
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<FromT>::type; + const from_vector vin = load_input(in_ptr + x); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + FromT val = get_value(in_ptr + x); + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const QuantizationInfo &qinfo_out = output->info()->quantization_info(); + const auto qval = qinfo_out.quantize(static_cast<float>(val), rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} + +void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create 
iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + const auto &qinfo_in = input->info()->quantization_info(); + const auto &qinfo_out = output->info()->quantization_info(); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + using from_vector = typename cast_vector<float>::type; + const auto vf = wrapper::vloadq(in_ptr + x); + const auto vin = vdequantize(vf, qinfo_in); + switch (output->info()->data_type()) + { + case DataType::U8: + { + using to_vector = typename cast_vector<uint8_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::QASYMM8: + { + using to_vector = typename cast_vector<float>::type; + const auto vf = vcast<to_vector, from_vector>(vin); + const auto vout = vquantize(vf, qinfo_out); + store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout); + break; + } + case DataType::U32: + { + using to_vector = typename cast_vector<uint32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::S32: + { + using to_vector = typename cast_vector<int32_t>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout); + break; + } + case DataType::F32: + { + using to_vector = typename cast_vector<float>::type; + const to_vector vout = vcast<to_vector, from_vector>(vin); + store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + qasymm8_t qval_in = *(in_ptr + x); + const auto val = qinfo_in.dequantize(qval_in); + + switch (output->info()->data_type()) + { + case DataType::U8: + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val); + break; + } + case DataType::QASYMM8: + { + const auto qval_out = qinfo_out.quantize(val, rounding_policy); + *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out; + break; + } + case DataType::U32: + { + *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val); + break; + } + case DataType::S32: + { + *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val); + break; + } + case DataType::F32: + { + *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + } + }, + in, out); +} +} // namespace + +NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE) +{ +} + +void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype)); + + _input = input; + _output = output; + _input_subtype = input_subtype; + + // Configure kernel window + auto win_config = 
validate_and_configure_window(input->info(), output->info()); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype)); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + return Status{}; +} + +void NECastKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::U8: + if (_input_subtype == SubDataType::BOOL) + { + run_cast<bool>(_input, _output, window); + } + else + { + run_cast<uint8_t>(_input, _output, window); + } + break; + case DataType::QASYMM8: + run_cast_qasymm8(_input, _output, window); + break; + case DataType::U32: + run_cast<uint32_t>(_input, _output, window); + break; + case DataType::S32: + run_cast<int32_t>(_input, _output, window); + break; + case DataType::F32: + run_cast<float>(_input, _output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp new file mode 100644 index 000000000..8a2223c26 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include <arm_neon.h> +#include <cstdint> + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != + 0); + // Validate output if initialized + if (output->total_size() != 0) + { + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} +} // namespace + +NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx() + : _input(nullptr), _output(nullptr), _block_shape() +{ +} + +void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output, + int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); + + _input = input; + _output = output; + _block_shape = block_shape; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + ICPPKernel::configure(win); +} + +Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); + return Status{}; +} + +void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + const int idx_channel = + get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL); + const int depth_size = _input->info()->dimension(idx_channel); + const int r = (depth_size / (_block_shape * _block_shape)); + const int element_size = _input->info()->element_size(); + + Window slice_out = window.first_slice_window_3D(); + + // The slice_out slice does not move + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + 
slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Main loop for NCHW and NHWC + if (_input->info()->data_layout() == DataLayout::NCHW) + { + Window slice_in = window.first_slice_window_2D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); + } + else + { + Window slice_in = window.first_slice_window_3D(); + do + { + Iterator in(_input, slice_in); + execute_window_loop(slice_in, + [&](const Coordinates &id) { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp new file mode 100644 index 000000000..cebd614df --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+class Coordinates;
+
+namespace
+{
+template <ElementWiseUnaryEx op, typename ScalarType>
+inline ScalarType elementwise_op_scalar(const ScalarType &a)
+{
+  switch (op)
+  {
+    case ElementWiseUnaryEx::NEG:
+      return -a;
+    default:
+      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+  }
+}
+
+template <ElementWiseUnaryEx op, typename VectorType>
+inline VectorType elementwise_op(const VectorType &a)
+{
+  switch (op)
+  {
+    case ElementWiseUnaryEx::NEG:
+      return wrapper::vneg(a);
+    default:
+      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+  }
+}
+
+template <ElementWiseUnaryEx op, typename ScalarType>
+void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
+{
+  const int window_step_x = 16 / sizeof(ScalarType);
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+
+  Window win = window;
+  win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+  Iterator input(in, win);
+  Iterator output(out, win);
+
+  execute_window_loop(win,
+                      [&](const Coordinates &) {
+                        auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+                        const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
+
+                        int x = window_start_x;
+                        for (; x <= window_end_x - window_step_x; x += window_step_x)
+                        {
+                          wrapper::vstore(output_ptr + x,
+                                          elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
+                        }
+                        for (; x < window_end_x; ++x)
+                        {
+                          *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
+                        }
+                      },
+                      input, output);
+}
+
+template <ElementWiseUnaryEx op>
+std::function<void(const ITensor *input, ITensor *output, const Window &window)>
+configure_func(const ITensor *input, ITensor *output)
+{
+  std::string function_to_call("op_");
+  function_to_call += string_from_data_type(input->info()->data_type()) + "_";
+  function_to_call += string_from_data_type(output->info()->data_type());
+
+  static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *>
+      map_function = {
+          {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>},
+      };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+  auto it = map_function.find(function_to_call);
+
+  if (it != map_function.end())
+  {
+    auto func = it->second;
+    return [func](const ITensor *input, ITensor *output, const Window &window) {
+      func(input, output, window);
+    };
+  }
+  return nullptr;
+}
+} // namespace
+
+NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx()
+    : _function(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input,
+                                           ITensor *output)
+{
+  // Check the pointers before dereferencing them for validation
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
+
+  // Configure kernel window
+  const 
std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _output = output; + + INEKernel::configure(win); + + switch (op) + { + case ElementWiseUnaryEx::NEG: + _function = configure_func<ElementWiseUnaryEx::NEG>(input, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, + DataType::S32); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); + } + + return Status{}; +} + +Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output)); + return Status{}; +} + +void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_function == nullptr); + _function(_input, _output, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..5401afea0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() + : _input(nullptr), _lookups(nullptr), _output(nullptr) +{ +} + +void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output, + const ITensor *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Auto initialize output if not initialized + auto out_shape = input->info()->tensor_shape(); + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions()); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + const arm_compute::ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + return Status{}; +} + +void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const int32_t lookup = *reinterpret_cast<int32_t *>( + _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp new file mode 100644 index 000000000..ce2413dc1 --- /dev/null +++ 
b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +namespace arm_compute +{ +namespace +{ +/** Validate the indices + * + * Validate that indices are not negative + * + * @param[in] indices Indices tensor info. 
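+ * @tparam    U       Scalar type of the indices: uint32_t or int32_t, matching the
+ *                    indices tensor's data type.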
+ */ +template <typename U> void validate_indices(const ITensor *indices) +{ + for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i) + { + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0); + } +} + +} // namespace + +NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {} + +template <typename U> +inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Iterator output_it(_output, window); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), 0); + + U new_index; + switch (_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +template <typename U> +void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator output_it(_output, output_window); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices->info()->num_dimensions(), _axis); + + U new_index; + switch (_indices->info()->num_dimensions()) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + if (_axis < 0) + { + _axis += input->info()->num_dimensions(); + } + ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions())); + + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = 
&NEGatherKernelEx::gather_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_n_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Output auto initialization if not yet initialized + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + + if (axis < 0) + { + axis += input->num_dimensions(); + } + + ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window, info); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp new file mode 100644 index 000000000..391337bfb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <unordered_map> + +using namespace arm_compute; + +namespace +{ +constexpr size_t NOT_HIT = 0xFFFFFFFF; +} // namespace + +NEHashtableLookupKernel::NEHashtableLookupKernel() + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} +{ +} + +void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys, + const ITensor *input, ITensor *output, ITensor *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Auto initialize output if not initialized + auto out_shape{input->info()->tensor_shape()}; + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions(), false); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Auto initialize hits if not initialized + auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1)); + + // 
Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + // Validate in case of configured hits + if (hits->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1)); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + } + + return Status{}; +} + +void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + const int const_0 = _output->info()->data_type() == DataType::QASYMM8 + ? _output->info()->quantization_info().offset + : 0; + + std::unordered_map<int32_t, size_t> key_index_map; + for (size_t n = 0; n < _keys->info()->dimension(0); ++n) + { + const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n})); + key_index_map[key] = n; + } + std::vector<size_t> lookup_indices; + for (size_t k = 0; k < _lookups->info()->dimension(0); ++k) + { + const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k})); + const auto it = key_index_map.find(key); + if (it == key_index_map.end()) + { + lookup_indices.emplace_back(NOT_HIT); + *_hits->ptr_to_element({k}) = 0; + } + else + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= _keys->info()->dimension(0)) + ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices.emplace_back(it->second); + *_hits->ptr_to_element({k}) = 1; + } + } + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..1ea77fb5c --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +template <typename T> +void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window) +{ + /** NEON vector tag type. 
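Resolves to the wrapper tag that selects 128-bit (W128) NEON intrinsics for T.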
*/ + using ExactTagType = + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + const auto channel_idx = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = 
wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + if (gamma != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), + "Gamma's size must be the same as size of input's channel"); + } + + if (beta != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), + "Beta's size must be the same as size of input's channel"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) +{ +} + +void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output, + ITensor *gamma, ITensor *beta, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? 
input : output;
+  _gamma = gamma;
+  _beta = beta;
+  _epsilon = epsilon;
+
+  // gamma and beta are optional; guard the info() calls so a nullptr stays valid here
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(),
+                                                gamma != nullptr ? gamma->info() : nullptr,
+                                                beta != nullptr ? beta->info() : nullptr, epsilon));
+
+  if (_input->info()->data_type() == DataType::F32)
+  {
+    _func = &instance_normalization_nchw<float>;
+  }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  else if (_input->info()->data_type() == DataType::F16)
+  {
+    _func = &instance_normalization_nchw<float16_t>;
+  }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported data type");
+  }
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(_input->info(), _output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+  INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input,
+                                                      const ITensorInfo *output,
+                                                      const ITensorInfo *gamma,
+                                                      const ITensorInfo *beta, float epsilon)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+  ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+      input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+  return Status{};
+}
+
+void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+  (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
new file mode 100644
index 000000000..de218d489
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + // Checks performed when output is configured + if ((output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +inline int32x4x4_t load_value(const int32_t *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) +{ + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t ret = {{ + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + }}; + return ret; +} +} // namespace + +NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() + : 
_input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+
+void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor,
+                                            ITensor *output, float multiplier)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+  _input = input;
+  _scale_factor = scale_factor;
+  _output = output;
+  _multiplier = multiplier;
+
+  // Configure kernel window
+  Window win_config = calculate_max_window(*input->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  INEKernel::configure(win_config);
+}
+
+Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+                                             const ITensorInfo *scale_factor,
+                                             const ITensorInfo *output, float multiplier)
+{
+  ARM_COMPUTE_UNUSED(multiplier);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+
+  return Status{};
+}
+
+template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window)
+{
+  constexpr auto window_step = 16;
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+
+  // Collapse window and reset first dimension to handle tail calculations manually
+  // Only 2D input is supported
+  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+  Iterator input(_input, win_collapsed);
+  Iterator output(_output, win_collapsed);
+  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+  execute_window_loop(
+    win_collapsed,
+    [&](const Coordinates &id) {
+      // Each row (second dimension) carries its own scale factor
+      auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
+      scale *= _multiplier;
+
+      const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
+      auto output_ptr = reinterpret_cast<T *>(output.ptr());
+      int x = window_start_x;
+      for (; x <= (window_end_x - window_step); x += window_step)
+      {
+        // Store as T so the float16_t specialization converts the F32 results back to F16
+        store_result<T>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
+      }
+      // Compute left-over elements
+      for (; x < window_end_x; ++x)
+      {
+        output_ptr[x] = input_ptr[x] * scale;
+      }
+    },
+    input, output);
+}
+
+void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+  switch (_output->info()->data_type())
+  {
+    case DataType::F32:
+      NEMultiplyScaleFactorKernel::multiply<float>(window);
+      break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    case DataType::F16:
+      NEMultiplyScaleFactorKernel::multiply<float16_t>(window);
+      break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    default:
+      ARM_COMPUTE_ERROR("Unsupported data type.");
+  }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
new file mode 100644
index 000000000..ad1bb9051
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+namespace
+{
+
+/** Conditional element-wise operations */
+enum class ConditionalOperation
+{
+  PRELU, /**< (x * y) for x < 0, x for x >= 0 */
+};
+
+template <ConditionalOperation op, typename ScalarType>
+inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
+{
+  auto res = ScalarType(0);
+
+  switch (op)
+  {
+    case ConditionalOperation::PRELU:
+      res = a < 0 ? a * b : a;
+      break;
+    default:
+      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+  }
+  return res;
+}
+
+template <ConditionalOperation op>
+inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
+                                                           QuantizationInfo qinfo)
+{
+  return qinfo.quantize(elementwise_conditional_op_scalar<op>(a, b), RoundingPolicy::TO_NEAREST_UP);
+}
+
+template <ConditionalOperation op, typename VectorType>
+inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
+{
+  VectorType res = {0, 0, 0, 0};
+  VectorType const_0 = {0, 0, 0, 0};
+
+  switch (op)
+  {
+    case ConditionalOperation::PRELU:
+      // Per lane: select a where a > 0, otherwise a * b
+      res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
+      break;
+    default:
+      ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+  }
+  return res;
+}
+
+template <ConditionalOperation op>
+inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+  float32x4x4_t out = {{
+    elementwise_conditional_op<op>(a.val[0], b.val[0]),
+    elementwise_conditional_op<op>(a.val[1], b.val[1]),
+    elementwise_conditional_op<op>(a.val[2], b.val[2]),
+    elementwise_conditional_op<op>(a.val[3], b.val[3]),
+  }};
+  return out;
+}
+
+template <ConditionalOperation op, typename ScalarType, typename VectorType>
+inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
+                                                       const ScalarType &broadcast_value,
+                                                       const bool reorder)
+{
+  VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
+  return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
+                                        reorder ? 
a : broadcast_vector); +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b)); + } + return x; +} + +template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x, + int window_step_x, const uint8_t *input1_ptr, + const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, + float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <ConditionalOperation op> +inline int elementwise_conditional_op_quantized_broadcast_loop( + int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af, + reorder ? 
af : broadcast_vector); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ConditionalOperation op, typename ScalarType, typename VectorType> +void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>, + &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_conditional_op_loop<op, ScalarType, VectorType>); +} + +template <ConditionalOperation op> +void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>, + &elementwise_conditional_op_quantized_broadcast_loop<op>, + &elementwise_conditional_op_quantized_loop<op>); +} +} // namespace + +NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {} + +void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info())); + + // Configure kernel window + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info()); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type()); + + Window win = calculate_max_window(valid_region); + + _input = input; + _alpha = alpha; + _output = output; + INEKernel::configure(win); +} + +void NEPReLUKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + if (_input->info()->data_type() == DataType::F32) + { + elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha, + _output, window); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output, + window); + } + else + { + ARM_COMPUTE_ERROR("Wrong Type"); + } +} + +Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha, + const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Checks performed when output is configured + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output)); + + return Status{}; +} diff --git 
a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp new file mode 100644 index 000000000..acf0092eb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + return Status{}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+inline float32x4_t round(const float32x4_t &fv)
+{
+  const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f);
+  const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f);
+  // If value < 0, mask = -1, else mask = 0
+  int32x4_t mask_less_zero_s32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4));
+  // Converting the mask to float yields -1.f or 0.f, so the net offset is +0.5 for
+  // non-negative lanes and -0.5 for negative lanes
+  return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_s32x4), point5_f32x4));
+}
+
+inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale)
+{
+  const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv);
+  const int32x4_t vposend = vdupq_n_s32(max_scale);
+  const int32x4_t vnegend = vdupq_n_s32(-max_scale);
+
+  const int32x4x4_t rf = {{
+#ifdef __aarch64__
+    vminq_s32(vposend,
+              vmaxq_s32(vnegend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+    vminq_s32(vposend,
+              vmaxq_s32(vnegend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+    vminq_s32(vposend,
+              vmaxq_s32(vnegend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+    vminq_s32(vposend,
+              vmaxq_s32(vnegend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#else //__aarch64__
+    vminq_s32(vposend, vmaxq_s32(vnegend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))),
+    vminq_s32(vposend, vmaxq_s32(vnegend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))),
+    vminq_s32(vposend, vmaxq_s32(vnegend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))),
+    vminq_s32(vposend, vmaxq_s32(vnegend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))),
+#endif //__aarch64__
+  }};
+  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+  return vcombine_s8(pa, pb);
+}
+} // namespace
+
+NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel()
+  : _input(nullptr), _output(nullptr), _scale_factor(nullptr)
+{
+}
+
+void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output,
+                                              ITensor *scale_factor)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(input->info(), output->info(), scale_factor->info()));
+
+  _input = input;
+  _output = output;
+  _scale_factor = scale_factor;
+
+  // Configure kernel window
+  Window win_config = calculate_max_window(*input->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  INEKernel::configure(win_config);
+}
+
+Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                               const ITensorInfo *scale_factor)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor));
+
+  return Status{};
+}
+
+template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window)
+{
+  constexpr auto window_step = 16;
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+
+#ifdef __aarch64__
+  constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else //__aarch64__
+  constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP;
+#endif //__aarch64__
+
+  // Reset first dimension to handle tail calculations manually
+  // Only 2D input is supported
+  Window win_collapsed = window;
+  Iterator input(_input, win_collapsed);
+  Iterator output(_output, 
win_collapsed); + const auto dim_x = _input->info()->dimension(0); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); +} + +void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::F32: + NEQuantizationSymmetricKernel::quantize<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEQuantizationSymmetricKernel::quantize<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp new file mode 100644 index 000000000..59e7d9beb --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +float32x2_t calculate_min(float32x4_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +float32x2_t calculate_max(float32x4_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmax(pmax, pmax); +} +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +int32x2_t calculate_min(int32x4_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +int32x2_t calculate_max(int32x4_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmax(pmax, pmax); +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +inline uint8x8_t calculate_min(uint8x16_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. +inline uint8x8_t calculate_max(uint8x16_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +// Helper function to calculate the minimum value of the input vector. All the elements in the +// output vector contain the min value. +inline float16x4_t calculate_min(float16x8_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the +// output vector contain the max value. 
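+// The maximum is obtained with successive pairwise max operations (vpmax): the two halves of
+// the 8-lane input are folded together, then the result is folded onto itself until every
+// lane holds the overall maximum.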
+inline float16x4_t calculate_max(float16x8_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <class F> class Reducer +{ +public: + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_1D(); + Window out_slice = out_window.first_slice_window_1D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), op); + } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); + } + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), + output->info()->dimension(1))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 1, op); + } while (in_window.slide_window_slice_2D(in_slice) && + out_window.slide_window_slice_2D(out_slice)); + } + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), + output->info()->dimension(2))); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_3D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 2, op); + } while (in_window.slide_window_slice_3D(in_slice) && + out_window.slide_window_slice_3D(out_slice)); + } + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, + const ReduceOperation op) + { + // Set in/out window + Window in_window(window); + Window out_window(window); + + in_window.set(3, Window::Dimension(0, 1, 1)); + out_window.set(3, Window::Dimension(0, 1, 1)); + + // Get first input and output slices + Window in_slice = in_window.first_slice_window_4D(); + Window out_slice = out_window.first_slice_window_4D(); + + do + { + Iterator in(input, in_slice); + Iterator out(output, out_slice); + + f(in, out, in_slice, out_slice, *input->info(), 3, op); + } while (in_window.slide_window_slice_4D(in_slice) && + out_window.slide_window_slice_4D(out_slice)); + } +}; + +template <typename T, int S> struct RedOpX +{ + /** NEON vector tag type. 
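+   * Deduced from the element type T and the vector width S via wrapper::traits::neon_vector;
+   * used below to build the initial accumulator vector with wrapper::vdup_n.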
*/ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + auto init_res_value = static_cast<T>(0.f); + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + init_res_value = *reinterpret_cast<T *>(input.ptr()); + break; + } + default: + break; + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0); + break; + } + case ReduceOperation::MAX: + { + *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } +}; + +struct RedOpX_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + ARM_COMPUTE_UNUSED(in_info); + + uint8x16_t vec_res_value = {0}; + + if (op == ReduceOperation::MIN || op == ReduceOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{}); + } + + execute_window_loop(in_slice, + [&](const Coordinates &) { + const auto vec_elements = wrapper::vloadq(input.ptr()); + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input); + + switch (op) + { + case ReduceOperation::MIN: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + break; + } + case ReduceOperation::MAX: + { + *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + } +}; + +template <typename T, int S> struct RedOpYZW +{ + /** NEON vector tag type. 
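+   * Together with the neon_vector type alias below, it keeps the accumulator the same width
+   * as one load of T, so the reduction along the chosen axis is performed on a whole vector
+   * of lanes at a time.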
*/ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + neon_vector vec_res_value = {0}; + switch (op) + { + case ReduceOperation::MIN: + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr())); + break; + } + default: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + break; + } + } + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr; + switch (axis) + { + case 1: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim))); + break; + case 2: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim))); + break; + case 3: + in_ptr = reinterpret_cast<T *>( + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim))); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +struct RedOpYZW_qasymm8 +{ + inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, + const TensorInfo &in_info, int axis, const ReduceOperation op) + { + ARM_COMPUTE_UNUSED(out_slice); + + execute_window_loop( + in_slice, + [&](const Coordinates &) { + auto vec_res_value = wrapper::vloadq(input.ptr()); + + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) + { + uint8_t *in_ptr; + switch (axis) + { + case 1: + in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim)); + break; + case 2: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim)); + break; + case 3: + in_ptr = + input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim)); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + const auto vec_elements = wrapper::vloadq(in_ptr); + + switch (op) + { + case ReduceOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReduceOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value); + }, + input, output); + } +}; + +void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, + const ReduceOperation op) +{ + const bool is_complex = (input->info()->num_channels() == 2); + if (is_complex) + { + ARM_COMPUTE_ERROR("Not supported"); + } + + switch (axis) + { + case 0: + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + return 
Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+          return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+        case DataType::S32:
+          return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(),
+                                                      op);
+        default:
+          ARM_COMPUTE_ERROR("Not supported");
+      }
+    case 1:
+      switch (input->info()->data_type())
+      {
+        case DataType::QASYMM8:
+          return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+          return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output,
+                                                          RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+          return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(),
+                                                      op);
+        case DataType::S32:
+          return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output,
+                                                        RedOpYZW<int32_t, 4>(), op);
+        default:
+          ARM_COMPUTE_ERROR("Not supported");
+      }
+    case 2:
+      switch (input->info()->data_type())
+      {
+        case DataType::QASYMM8:
+          return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+          return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output,
+                                                          RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+          return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(),
+                                                      op);
+        case DataType::S32:
+          return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output,
+                                                        RedOpYZW<int32_t, 4>(), op);
+        default:
+          ARM_COMPUTE_ERROR("Not supported");
+      }
+    case 3:
+      switch (input->info()->data_type())
+      {
+        case DataType::QASYMM8:
+          return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+          return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output,
+                                                          RedOpYZW<float16_t, 8>(), op);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+          return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(),
+                                                      op);
+        case DataType::S32:
+          return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output,
+                                                        RedOpYZW<int32_t, 4>(), op);
+        default:
+          ARM_COMPUTE_ERROR("Not supported");
+      }
+    default:
+      ARM_COMPUTE_ERROR("Unsupported reduction axis");
+  }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis,
+                          ReduceOperation op)
+{
+  ARM_COMPUTE_UNUSED(op);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+
+  if (input->num_channels() == 1)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_MSG("Complex input is not supported");
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                  "Reduction axis greater than max number of dimensions");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
+
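+    // If the output has already been configured, its shape must match the input shape with the
+    // reduced axis collapsed to a single element.
+    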
const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(op); + + // Calculate output shape and set if empty + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + + // Output auto initialization if not yet initialized + DataType output_data_type = input->data_type(); + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + + return std::make_tuple(err, win); +} +} // namespace + +NEReductionOperationKernelEx::NEReductionOperationKernelEx() + : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX), + _border_size() +{ +} + +BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; } + +void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); + + unsigned int num_elems_processed_per_iteration = + 16 / data_size_from_type(input->info()->data_type()); + + _input = input; + _output = output; + _border_size = + (axis == 0) + ? 
BorderSize(0, num_elems_processed_per_iteration - + (input->info()->dimension(0) % num_elems_processed_per_iteration), + 0, 0) + : BorderSize(); + _op = op; + _reduction_axis = axis; + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( + validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op))); + + return Status{}; +} + +void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + reduce_op(window, _input, _output, _reduction_axis, _op); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp new file mode 100644 index 000000000..36a2f55a9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include <arm_neon.h> +#include <cstdint> + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + + ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); + + // Validate output if initialized + if (output->total_size() != 0) + { + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] != + output->tensor_shape()[idx_batch]); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) != + 0); + ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != + output->tensor_shape().total_size()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} +} // namespace + +NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx() + : _input(nullptr), _output(nullptr), _block_shape() +{ +} + +void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output, + int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape)); + + _input = input; + _block_shape = block_shape; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps()); + INEKernel::configure(win); +} + +Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape)); + return Status{}; +} + +void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); + + const DataLayout data_layout = _input->info()->data_layout(); + const int channel_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int element_size = _input->info()->element_size(); + + const size_t channel_size = _input->info()->dimension(channel_idx); + + Window slice_out = window.first_slice_window_3D(); + + int batch_id = 0; + + // Main loop for NCHW and NHWC + if 
(_output->info()->data_layout() == DataLayout::NCHW) + { + do + { + Iterator out(_output, slice_out); + execute_window_loop(slice_out, + [&](const Coordinates &id) { + const size_t channel_id = id.z(); + const size_t in_x = + id.x() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = + id.y() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{in_x, in_y, z, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); + ++batch_id; + } while (window.slide_window_slice_3D(slice_out)); + } + else + { + do + { + Iterator out(_output, slice_out); + execute_window_loop(slice_out, + [&](const Coordinates &id) { + const size_t channel_id = id.x(); + const size_t in_x = + id.y() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = + id.z() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{z, in_x, in_y, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); + ++batch_id; + } while (window.slide_window_slice_3D(slice_out)); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp new file mode 100644 index 000000000..94242b56b --- /dev/null +++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+const std::pair<unsigned int, unsigned int>
+arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+                                             unsigned int kernel_width, unsigned int kernel_height,
+                                             const PadStrideInfo &info, unsigned int invalid_right,
+                                             unsigned int invalid_bottom)
+{
+  const unsigned int stride_x = info.stride().first;
+  const unsigned int stride_y = info.stride().second;
+  const unsigned int padx = info.pad_left() + info.pad_right();
+  const unsigned int pady = info.pad_top() + info.pad_bottom();
+
+  ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
+  ARM_COMPUTE_ERROR_ON(kernel_width <= padx);
+  ARM_COMPUTE_ERROR_ON(kernel_height <= pady);
+
+  // Find the transposed convolution output dimensions:
+  //   tconv_out + pad = kernel + (in - 1) * stride + invalid
+  //   tconv_out = kernel + (in - 1) * stride + invalid - pad
+  const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right;
+  const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom;
+
+  return std::make_pair<unsigned int, unsigned int>(w, h);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
new file mode 100644
index 000000000..158fe0b0c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This empty translation unit exists only to verify that "CLFunctionsEx.h" compiles on its own.
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
new file mode 100644
index 000000000..ae64a6edd
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
+
+#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+
+CLArgOperation::CLArgOperation()
+{
+  // DO NOTHING
+}
+
+void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
+                               ArgOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
+  _input = input;
+  _output = output;
+  _axis = axis;
+  _arg_op = op;
+  // NOTE The axis values must not contain any duplicates.
+  _num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = _num_of_kernels - 1;
+
+  _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _argop_kernels =
+    arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
+
+  TensorShape shape{input->info()->tensor_shape()};
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(_axis[i], 1);
+    _interm_tensors[i].allocator()->init(
+      TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
+        .set_data_layout(input->info()->data_layout()));
+    _interm_tensors[i].allocator()->allocate();
+  }
+
+  // Build a vector of the ICLTensors in execution order.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Configure an ArgMinMax kernel for each axis
+  for (size_t i = 0; i < _num_of_kernels; i++)
+  {
+    _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
+  }
+}
+
+Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
+                                const ITensorInfo *output, ArgOperation op)
+{
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - 1;
+
+  // Create temporary tensor infos
+  auto interm_tensors =
+    arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create the intermediate tensor shape
+  TensorShape shape{input->tensor_shape()};
+
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    shape.set(axis[i], 1);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+  }
+
+  // Build a vector of the ITensorInfos in execution order.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; i++)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate each ArgMinMax kernel
+  for (size_t i = 0; i < num_of_kernels; i++)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
+  }
+
+  return Status{};
+}
+
+void CLArgOperation::run()
+{
+  for (size_t i = 0; i < _num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_argop_kernels[i]);
+  }
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
new file mode 100644
index 000000000..7c5fe5eda
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp new file mode 100644 index 000000000..742fc6f59 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLCast.h" + +#include "arm_compute/core/CL/kernels/CLCastKernel.h" + +using namespace arm_compute; + +void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp new file mode 100644 index 000000000..c2e4ca9ff --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h" + +#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h" + +using namespace arm_compute; + +void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size) +{ + auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>(); + k->configure(input, output, block_size); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp new file mode 100644 index 000000000..2781784ca --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +using namespace arm_compute; + +void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..c6b166163 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h" + +using namespace arm_compute; + +void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input, + const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, + arm_compute::ICLTensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_cl_buffer.info(), + _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( + _input->info()->data_layout())); + _cl_reshape.configure(_input, &_cl_buffer); + + _cl_fc.configure(&_cl_buffer, _weights, _biases, _output); + + // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_buffer.allocator()->allocate(); + } + else + { + _cl_fc.configure(_input, _weights, _biases, _output); + } +} + +void CLFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _cl_reshape.run(); + + _cl_fc.run(); +} + +void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc.prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp new file mode 100644 index 000000000..6cad9bd2e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLGatherEx.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +using namespace arm_compute; + +void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, + int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return CLGatherExKernel::validate(input, indices, output, axis); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp new file mode 100644 index 000000000..7180e9356 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +using namespace arm_compute; + +void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..86ea5a66d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} + +void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, float epsilon) +{ + auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + k->configure(input, output, gamma, beta, epsilon); + _kernel = std::move(k); +} + +Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp new file mode 100644 index 000000000..be35ea732 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLNeg.h" + +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +using namespace arm_compute; + +void CLNeg::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp new file mode 100644 index 000000000..38adedd10 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPReLU.h" + +#include "arm_compute/core/CL/kernels/CLPReLUKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>(); + k->configure(input, alpha, output); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp new file mode 100644 index 000000000..2a34c0664 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), + _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), + _gemm_output(), _add_output(), _is_prepared(false) +{ +} + +Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info) +{ + const int idx_width = 0; + const int idx_height = 1; + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, + output); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != + recurrent_weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != + recurrent_weights->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + hidden_state->tensor_shape()); + + auto shape_info = + TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f)); + ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate( + ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info)); + + return Status{}; +} + +void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *recurrent_weights, const ICLTensor *bias, + ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(), + recurrent_weights->info(), bias->info(), + hidden_state->info(), output->info(), info)); + + const int idx_height = 1; + TensorShape shape = + compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + + _is_prepared = false; + + _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + + // Manage intermediate buffers and configure + _memory_group.manage(&_fully_connected_out); + _fully_connected_kernel.configure(input, 
weights, bias, &_fully_connected_out); + + _memory_group.manage(&_gemm_output); + _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); + + _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _memory_group.manage(&_add_output); + + _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, + &_add_output, ConvertPolicy::SATURATE); + + _fully_connected_out.allocator()->allocate(); + _gemm_output.allocator()->allocate(); + + _activation_kernel.configure(&_add_output, hidden_state, info); + _add_output.allocator()->allocate(); + + _copy_kernel.configure(hidden_state, output); +} + +void CLRNNLayerEx::run() +{ + prepare(); + + _memory_group.acquire(); + + _fully_connected_kernel.run(); + _gemm_state_f.run(); + CLScheduler::get().enqueue(_add_kernel); + CLScheduler::get().enqueue(_activation_kernel); + + // copy hidden out to output + CLScheduler::get().enqueue(_copy_kernel); + + _memory_group.release(); +} + +void CLRNNLayerEx::prepare() +{ + if (!_is_prepared) + { + _fully_connected_kernel.prepare(); + _gemm_state_f.prepare(); + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp new file mode 100644 index 000000000..13a25c901 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLReduceOperation.h" + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() +{ +} + +Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, bool keep_dims, + const ReduceOperation &op) +{ + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 
1 : 0);
+
+ // Create temporary tensor infos
+ auto interm_tensors =
+ arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+ // Create intermediate tensor info
+ TensorShape shape{input->tensor_shape()};
+
+ auto it = axis.begin();
+ for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+ {
+ shape.set(*it, 1, false);
+ interm_tensors[i].set_data_type(input->data_type());
+ interm_tensors[i].set_tensor_shape(shape);
+ interm_tensors[i].set_num_channels(input->num_channels());
+ interm_tensors[i].set_data_layout(input->data_layout());
+ interm_tensors[i].set_quantization_info(input->quantization_info());
+ }
+
+ // Build a vector of the tensor infos in sequential order: input, intermediates, output.
+ std::vector<const ITensorInfo *> tensors;
+ tensors.emplace_back(input);
+ for (size_t i = 0; i < num_of_interm_tensors; ++i)
+ {
+ tensors.emplace_back(interm_tensors.get() + i);
+ }
+ tensors.emplace_back(output);
+
+ // Validate ReduceOperation for every kernel in the chain
+ it = axis.begin();
+ for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+ }
+
+ if (!keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+ }
+
+ return Status{};
+}
+
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ ReduceOperation op)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+ _axis = axis;
+
+ _input = input;
+ _output = output;
+ _keep_dims = keep_dims;
+
+ // NOTE axis is a std::set, so it contains no duplicate values.
+ const size_t num_of_kernels = axis.size();
+ const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+ _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels =
+ arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
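How the kernel chain transforms shapes is easiest to see with a concrete walkthrough; the shapes below are assumed purely for illustration and mirror the configure/reshape logic around this point:

// e.g. reducing a [2,3,4,5] input over axis = {1, 2}
//   keep_dims == true : input[2,3,4,5] -> kernel0 -> interm[2,1,4,5] -> kernel1 -> output[2,1,1,5]
//   keep_dims == false: input[2,3,4,5] -> kernel0 -> interm0[2,1,4,5] -> kernel1 -> interm1[2,1,1,5]
//                       -> CLReshapeLayer -> output[2,5]

+ // Build a vector of the tensors in sequential order: input, intermediates, output.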
+ std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ReduceOperation on all kernels + TensorShape shape{input->info()->tensor_shape()}; + auto it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + shape.set(*it, 1, false); + if (!keep_dims || i != (num_of_kernels - 1)) + { + _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); + _memory_group.manage(&_interm_tensors[i]); + } + _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op); + if (i != 0) + { + _interm_tensors[i - 1].allocator()->allocate(); + } + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output); + _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate(); + } +} + +void CLReduceOperation::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + const size_t num_of_kernels = _axis.size(); + for (size_t i = 0; i < num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_reduce_kernels[i]); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp new file mode 100644 index 000000000..c03826891 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h" + +#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h" + +using namespace arm_compute; + +void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size, + const ICLTensor *padding_size, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>(); + k->configure(input, block_size, padding_size, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp new file mode 100644 index 000000000..0f455f96f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
 + * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+ k->configure(input, output, block_size);
+ _kernel = std::move(k);
+} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp new file mode 100644 index 000000000..80d50ad94 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "../../topk_v2.h"
+
+namespace arm_compute
+{
+
+CLTopKV2::CLTopKV2()
+ : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0),
+ _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(),
+ _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(),
+ _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr),
+ _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(),
+ _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(),
+ _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(),
+ _reorder_negatives_kernel(), _store_kernel()*/
+{
+}
+
+void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits, int bits)
+{
+ _total_bits = total_bits;
+ _bits = bits;
+ _n = input->info()->tensor_shape()[0];
+
+ // _total_bits must be divisible by _bits.
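As a worked instance of this constraint (parameter values assumed purely for illustration): with 32-bit keys scanned 4 bits at a time, the radix sort uses 1 << 4 = 16 buckets and makes 32 / 4 = 8 passes, and 32 % 4 == 0 satisfies the assertion that follows.

// e.g. total_bits = 32 (float keys), bits = 4
//   -> radix = 1 << 4 = 16 buckets, 32 / 4 = 8 passes, 32 % 4 == 0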
 + ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0);
+
+ _k = k;
+ _radix = 1 << bits;
+
+ _input = input;
+ _values = values;
+ _indices = indices;
+
+ std::string topk_env;
+
+// The GPU implementation is disabled: it currently produces invalid results.
+// TODO Enable the GPU implementation once it is verified, or remove this code.
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ _qs_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _qs_temp_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n);
+ }
+ else if (topk_env == "GPU")
+ {
+ // n must be divisible by (_GROUPS * _ITEMS)
+ ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0);
+
+ _hist_buf_size = _radix * _GROUPS * _ITEMS;
+ _glob_sum_buf_size = _HISTOSPLIT;
+
+ _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _hist_buf_size);
+ _glob_sum_buf =
+ cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+ sizeof(cl_int) * _glob_sum_buf_size);
+ _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int));
+ _in_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _out_key_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n);
+ _in_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+ _out_ind_buf = cl::Buffer(CLScheduler::get().context(),
+ CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n);
+
+ _p_in_key_buf = &_in_key_buf;
+ _p_out_key_buf = &_out_key_buf;
+ _p_in_ind_buf = &_in_ind_buf;
+ _p_out_ind_buf = &_out_ind_buf;
+
+ _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n);
+ _hist_kernel.configure(&_hist_buf, bits, _n);
+ _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits);
+ _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits);
+ _reorder_kernel.configure(&_hist_buf, bits, _n);
+ _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n);
+ _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n);
+ _store_kernel.configure(values, indices, k, _n);
+ }
+ else
+#endif // Disable GPU implementation
+ {
+ // DO NOTHING for CPU.
+ }
+}
+
+void CLTopKV2::run()
+{
+ std::string topk_env;
+#if 0
+ char *env = getenv("ACL_TOPKV2");
+ if (env)
+ topk_env = env;
+
+ if (topk_env == "GPU_SINGLE")
+ {
+ run_on_gpu_single_quicksort();
+ }
+ else if (topk_env == "GPU")
+ {
+ run_on_gpu();
+ }
+ else
+#endif
+ {
+ run_on_cpu();
+ }
+}
+
+#if 0
+void CLTopKV2::run_on_gpu_single_quicksort()
+{
+ // This is a single threaded quick sort implementation.
+ CLScheduler::get().enqueue(_qs_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+}
+
+void CLTopKV2::run_on_gpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // 1. CLTopKV2Init sets the key buffer and the index buffer.
 + //    - The key buffer is initialized with the layer's input values
+ //    - The index buffer is initialized with the corresponding indices.
+ CLScheduler::get().enqueue(_init_kernel, false);
+
+ int n_passes = _total_bits / _bits;
+
+ // 2. Repeat (total_bits/bits) times.
+ //    - total_bits is the number of bits of the data type (e.g., 32 for float)
+ //    - bits defines the number of buckets (e.g., 16 buckets when bits is 4)
+ for (int pass = 0; pass < n_passes; ++pass)
+ {
+ arm_compute::CLScheduler::get().sync();
+
+ // 2.1. Calculate histogram with _GROUPS * _ITEMS threads
+ _hist_kernel.setPass(pass, _p_in_key_buf);
+ CLScheduler::get().enqueue(_hist_kernel, false);
+
+ // 2.2. Calculate prefix sum locally with multiple threads
+ CLScheduler::get().enqueue(_scan_hist_kernel, false);
+ // 2.3. Calculate the global prefix sum across work groups
+ CLScheduler::get().enqueue(_glob_scan_hist_kernel, false);
+ // 2.4. Paste the global sums back into the per-group histograms
+ CLScheduler::get().enqueue(_paste_hist_kernel, false);
+
+ // 2.5. Reorder keys and indices based on the global prefix sum
+ _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_kernel, false);
+
+ cl::Buffer *tmp;
+ // swap key buffers
+ tmp = _p_in_key_buf;
+ _p_in_key_buf = _p_out_key_buf;
+ _p_out_key_buf = tmp;
+
+ // swap index buffers
+ tmp = _p_in_ind_buf;
+ _p_in_ind_buf = _p_out_ind_buf;
+ _p_out_ind_buf = tmp;
+ }
+
+ // 3. Get the first negative index
+ // Because we swap the in and out buffers at the end of the loop above,
+ // the latest results are in the 'in' buffers.
+ _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf);
+ CLScheduler::get().enqueue(_find_first_negative_kernel, false);
+
+ // 4. Correct the ordering of negatives
+ //    - Since the radix sort treats keys as unsigned, negative values end up sorted above
+ //    positives.
+ // The reordered data will be stored in _p_out_key_buf and _p_out_ind_buf.
+ _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf,
+ _p_out_ind_buf);
+ CLScheduler::get().enqueue(_reorder_negatives_kernel, false);
+
+ // 5. Extract top k values from sorted keys and indices.
+ _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf);
+ CLScheduler::get().enqueue(_store_kernel, false);
+
+ arm_compute::CLScheduler::get().sync();
+
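Steps 2.1-2.5 are easier to follow against a scalar model. The sketch below is a minimal host-side rendition of one radix pass (histogram, prefix sum, stable reorder), written only to illustrate the data flow; it is not the OpenCL implementation and every name in it is hypothetical:

#include <cstdint>
#include <vector>

// Minimal scalar model of one radix pass: histogram the digit examined in
// this pass, exclusive-scan the histogram into bucket offsets, then
// stable-scatter the keys into their buckets.
void radix_pass(const std::vector<uint32_t> &in_key, std::vector<uint32_t> &out_key,
                int pass, int bits)
{
  const uint32_t radix = 1u << bits;
  const uint32_t mask = radix - 1;
  const int shift = pass * bits;
  std::vector<uint32_t> hist(radix, 0), offset(radix, 0);

  for (uint32_t key : in_key) // 2.1: histogram of the current digit
    ++hist[(key >> shift) & mask];

  for (uint32_t b = 1; b < radix; ++b) // 2.2-2.4: prefix sum -> first slot of each bucket
    offset[b] = offset[b - 1] + hist[b - 1];

  for (uint32_t key : in_key) // 2.5: stable reorder into bucket positions
    out_key[offset[(key >> shift) & mask]++] = key;
}

Running all total_bits/bits passes, least significant digit first, leaves the keys fully sorted (modulo the negative-key fix-up in step 4).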
 +#if 0
+ // The code below is left for debugging.
+ int first_neg;
+ q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg);
+ std::cout << "first neg = " << first_neg << std::endl;
+
+ float in_key[_n];
+ q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl;
+ }
+
+ float out_key[_n];
+ q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl;
+ }
+
+ int in_ind[_n];
+ q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl;
+ }
+
+ int out_ind[_n];
+ q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind);
+ for(uint32_t i = 0 ; i < _n; ++i) {
+ std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl;
+ }
+
+ int hist_buf[_hist_buf_size];
+ q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf);
+ for(uint32_t i = 0 ; i < _hist_buf_size; ++i) {
+ std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl;
+ }
+
+ int glob_sum_buf[_glob_sum_buf_size];
+ q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf);
+ for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) {
+ std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl;
+ }
+
+#endif
+}
+#endif // Disable GPU implementation
+
+void CLTopKV2::run_on_cpu()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+ // const Window& w = _topkv2_kernel.window();
+
+ _input->map(q);
+ _values->map(q);
+ _indices->map(q);
+
+ // int row_size = (w[0].end() - w[0].start()) / w[0].step();
+ int row_size = _input->info()->tensor_shape()[0];
+ int rank = _input->info()->num_dimensions();
+
+ if (rank > 2)
+ throw std::runtime_error("Not supported rank.");
+
+ int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1);
+
+ if (_input->info()->data_type() == DataType::F32)
+ {
+ nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(), (float *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::S32)
+ {
+ nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (int32_t *)_values->buffer());
+ }
+ else if (_input->info()->data_type() == DataType::QASYMM8)
+ {
+ nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k,
+ (int32 *)_indices->buffer(),
+ (uint8_t *)_values->buffer());
+ }
+ else
+ {
+ throw std::runtime_error("Not supported type.");
+ }
+
+ _input->unmap(q);
+ _values->unmap(q);
+ _indices->unmap(q);
+}
+
+} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp new file mode 100644 index 000000000..40e21671d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited.
 + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _is_prepared(false)
+{
+}
+
+Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+ DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+ const unsigned int kernel_x = weights->dimension(idx_w);
+ const unsigned int kernel_y = weights->dimension(idx_h);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
+ "invalid_right must be smaller than kernel_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
+ "invalid_bottom must be smaller than kernel_y");
+
+ // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were added.
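A quick gloss on invalid_right/invalid_bottom before the size computation below (the numbers here are assumed purely for illustration): they extend the computed output by right columns and bottom rows that the source model counts in its output shape even though plain deconvolution arithmetic would not produce them.

// e.g. a 4x4 input, 3x3 kernel, stride 2, no padding:
//   out_w = (4 - 1) * 2 + 3 - 0 + invalid_right
//   invalid_right == 0 -> out_w = 9;  invalid_right == 1 -> out_w = 10
// The checks above cap invalid_right at kernel_x - 1 (here, 2).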
+ auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], + "Output's depth is invalid."); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR( + CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, weights_info)); + + return Status{}; +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const DataLayout data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + _original_weights = weights; + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // NOTE From the existing CLDeconvolutionLayer, invalid_right and invalid_bottom were + // added. + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + // Output auto initialization if not yet initialized + auto_init_if_empty( + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _is_prepared = weights_info.retain_internal_weights(); + + _memory_group.manage(&_scaled_output); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order + // to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // configure scale function + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info); + _scaled_output.allocator()->allocate(); +} + +void CLTransposeConvLayer::run() +{ + prepare(); + + _memory_group.acquire(); + + _scale_f.run(); + _conv_f.run(); + + _memory_group.release(); +} + +void CLTransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _weights_flipped.map(true); + _original_weights->map(CLScheduler::get().queue(), true); + CPPScheduler::get().schedule(&_flip_weights, Window::DimZ); + _weights_flipped.unmap(); + _original_weights->unmap(CLScheduler::get().queue()); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp new file mode 100644 index 000000000..0ce3e6700 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h" + +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <cmath> +#include <memory> +#include <tuple> + +using namespace arm_compute; + +CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT + : _upsample(), + _output(nullptr) +{ +} + +Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output, + const BorderSize &inner_border, + const PadStrideInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + _output = output; + _upsample.configure(input, _output, inner_border, info); +} + +void CLTransposeConvLayerUpsample::run() +{ + _output->map(CLScheduler::get().queue(), true); + if (is_data_type_quantized_asymmetric(_output->info()->data_type())) + { + const uint8_t quantized_zero = _output->info()->quantization_info().offset; + std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero); + } + else + { + memset(_output->buffer(), 0, _output->info()->total_size()); + } + _output->unmap(CLScheduler::get().queue()); + + CLScheduler::get().enqueue(_upsample, false); +} diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp new file mode 100644 index 000000000..f8e0ef8a6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h" + +#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info) +{ + auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>(); + k->configure(input, output, info); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp new file mode 100644 index 000000000..80fbf359d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/NEON/NEFunctionsEx.h" + +// NOTE This empty file aims to validate "NEFunctionsEx.h". +// DO NOT REMOVE this file. diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp new file mode 100644 index 000000000..5ba465b61 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEArgMinMax.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEArgMinMax.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ + +template <ReductionOperation OP> +NEArgMinMaxStatic<OP>::NEArgMinMaxStatic(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernel(), _reduced_out(), _reshape() +{ +} + +template <ReductionOperation OP> +Status NEArgMinMaxStatic<OP>::validate(const ITensorInfo *input, int axis, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + + TensorShape out_shape = input->tensor_shape(); + const int input_dims = input->num_dimensions(); + int axis_local = axis; + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + ARM_COMPUTE_RETURN_ERROR_ON(axis_local > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local) > input->num_dimensions() - 1); + out_shape.remove_dimension(axis_local); + + const TensorInfo out_info = output->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +template <ReductionOperation OP> +void NEArgMinMaxStatic<OP>::configure(ITensor *input, int axis, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + int axis_local = axis; + const int input_dims = input->info()->num_dimensions(); + + // Convert negative axis + axis_local = wrap_around(axis_local, input_dims); + + // Perform reduction for axis + TensorShape intermediate_shape = input->info()->tensor_shape(); + intermediate_shape.set(axis_local, 1); + auto in = input; + + _reduced_out.allocator()->init(TensorInfo(intermediate_shape, output->info()->num_channels(), + output->info()->data_type(), + output->info()->quantization_info())); + _memory_group.manage(&_reduced_out); + _reduction_kernel.configure(in, axis_local, &_reduced_out, OP); + + // Allocate intermediate tensor + _reduced_out.allocator()->allocate(); + + // Configure reshape layer if we want to drop the dimensions + TensorShape out_shape = input->info()->tensor_shape(); + out_shape.remove_dimension(axis_local); + auto_init_if_empty(*output->info(), output->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_out, output); +} + +template <ReductionOperation OP> void NEArgMinMaxStatic<OP>::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + _reduction_kernel.run(); + _reshape.run(); +} + +// Supported Specializations +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MAX>; +template class NEArgMinMaxStatic<ReductionOperation::ARG_IDX_MIN>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp new file mode 100644 index 000000000..7c15fc453 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" +#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> + +#include "arm_compute/core/ITensor.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ + +template <BinaryLogicalOperation COP> +void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, + ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); +} + +template <BinaryLogicalOperation COP> +Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output); +} + +void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) +{ + auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); +} + +Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op) +{ + return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output); +} + +// Supported Specializations +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp new file mode 100644 index 000000000..f2490e4e8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. 
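NEBinaryLogicalOperationStatic above bakes the operation into the type at compile time, while NEBinaryLogicalOperation selects it at configure time; both wrap the same kernel. A hedged usage sketch with placeholder tensors (logical_ops_example is illustrative, and run() is assumed from the simple-function base):

    #include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void logical_ops_example(Tensor &a, Tensor &b, Tensor &out)
    {
      // Compile-time selection of the operation:
      NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND> logical_and;
      logical_and.configure(&a, &b, &out);
      logical_and.run();

      // Equivalent run-time selection:
      NEBinaryLogicalOperation generic;
      generic.configure(&a, &b, &out, BinaryLogicalOperation::OR);
      generic.run();
    }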
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NECast.h" + +#include "arm_compute/core/NEON/kernels/NECastKernel.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype) +{ + auto k = arm_compute::support::cpp14::make_unique<NECastKernel>(); + k->configure(input, output, input_subtype); + _kernel = std::move(k); +} + +Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, + SubDataType input_subtype) +{ + return NECastKernel::validate(input, output, input_subtype); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp new file mode 100644 index 000000000..db419e3a8 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
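NECast above differs from a plain cast by its extra SubDataType argument, which qualifies how the input bits should be interpreted before conversion. A hedged usage sketch; SubDataType::NONE is assumed to be the extension enum's no-reinterpretation member, and cast_example is illustrative:

    #include "arm_compute/runtime/NEON/functions/NECast.h"
    #include "arm_compute/runtime/Tensor.h"

    void cast_example(arm_compute::Tensor &u8_in, arm_compute::Tensor &f32_out)
    {
      arm_compute::NECast cast;
      // Convert U8 values to F32 without reinterpreting them (e.g. not as bool).
      cast.configure(&u8_in, &f32_out, arm_compute::SubDataType::NONE);
      cast.run();
    }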
+ */ + +#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape) +{ + auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>(); + k->configure(input, output, block_shape); + _kernel = std::move(k); +} + +Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + int32_t block_shape) +{ + return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp new file mode 100644 index 000000000..a95018a28 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEElementwiseUnaryLayerEx.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h" + +#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ +void NENegLayer::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NEElementwiseUnaryKernelEx>(); + k->configure(ElementWiseUnaryEx::NEG, input, output); + _kernel = std::move(k); +} +Status NENegLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx::NEG, input, output); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp new file mode 100644 index 000000000..00c3ed94f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) +{ + auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp new file mode 100644 index 000000000..d604fedbf --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
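The semantics behind NEEmbeddingLookupKernel can be stated in a few lines: output row i is a copy of input row lookups[i]. A plain C++ reference version (illustration only, not the NEON code):

    #include <cstdint>
    #include <vector>

    std::vector<std::vector<float>>
    embedding_lookup(const std::vector<std::vector<float>> &table,
                     const std::vector<int32_t> &lookups)
    {
      std::vector<std::vector<float>> out;
      out.reserve(lookups.size());
      for (const int32_t idx : lookups)
        out.push_back(table.at(idx)); // at() stands in for the kernel's bounds expectations
      return out;
    }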
+ */ +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output)); + + return Status{}; +} +} // namespace + +void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, + const ITensorInfo *output) +{ + return NETransposeKernel::validate(input, output); +} + +NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) +{ +} + +void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); +} + +void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _accumulate_biases = false; + _original_weights = weights; + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + bool _is_fc_after_conv; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1; + } + ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv, + "NEFullyConnectedHybridLayer does not support after conv"); + (void)_is_fc_after_conv; + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_output.allocator()->init( + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); + _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Quantize input + _quantized_input.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + _scale_factor.allocator()->init( + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); + + // GEMM + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output); + + // Multiply scale + _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output, + weights->info()->quantization_info().scale); + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; + + _quantized_input.allocator()->allocate(); + _scale_factor.allocator()->allocate(); + _gemmlowp_output.allocator()->allocate(); +} + +Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *weights_to_use = weights; + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + + // Validate quantization kernel + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8)); + const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( + &gemmlowp_output, &scale_factor, output, weights->quantization_info().scale)); + + return Status{}; +} + +void NEFullyConnectedHybridLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Quantize input + NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY); + + // Run matrix multiply + _mm_gemmlowp.run(); + + // Multiply scale factor + NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY); + + // Accumulate biases if provided + if (_accumulate_biases) + { + NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); + } +} + +void NEFullyConnectedHybridLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + auto release_unused = [](Tensor *w) { + if (!w->is_used()) + { + w->allocator()->free(); + } + }; + + // Reshape of the weights (happens only once) + if (!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + + _are_weights_reshaped = true; + // We cannot release _original_weights because it may still be used by other nodes + } + + // Prepare GEMM and release unused weights + _mm_gemmlowp.prepare(); + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp new file mode
100644 index 000000000..a944f699a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + if (is_data_type_quantized_asymmetric(input.data_type())) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info(input.quantization_info().scale, + -input.quantization_info().offset); + const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, + -weights.quantization_info().offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( + &input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + } + + return Status{}; +} +} // namespace + +NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), + _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), + _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), + _accumulate_biases(false), _is_quantized(false), 
_is_prepared(false) +{ +} + +void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info( + QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset)); + weights->info()->set_quantization_info( + QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); + + // Restore the original QuantizationInfo as input and weights could be used in other fully + // connected layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + } +} + +void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + shape_flatten)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_kernel.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, output); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, output); +} + +void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ?
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _original_weights = weights; + + // Configure gemmlowp output + if (_is_quantized) + { + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::S32)); + } + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !_is_quantized) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights.configure(weights_to_use, &_converted_weights_output, + input->info()->tensor_shape(), fc_info.weights_trained_layout); + + weights_to_use = &_converted_weights_output; + _are_weights_converted = false; + } + + ITensor *tmp_output = (_is_quantized) ? 
&_gemmlowp_output : output; + if (_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(input, weights_to_use, tmp_output); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(input, weights_to_use, tmp_output); + } + + // Configure output stage for asymmetric quantized types + if (_is_quantized) + { + float multiplier = input->info()->quantization_info().scale * + weights->info()->quantization_info().scale / + output->info()->quantization_info().scale; + int output_multiplier; + int output_shift; + quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, + &output_shift); + _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, + output_shift, output->info()->quantization_info().offset); + _gemmlowp_output.allocator()->allocate(); + } + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; +} + +Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); + + const ITensorInfo &flatten_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = + weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *input_to_use = input; + const ITensorInfo *weights_to_use = weights; + const ITensorInfo *tmp_output = (is_quantized) ? 
&gemmlowp_output : output; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->dimension(1) > 1; + + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(), + output->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = input->num_dimensions() > 1; + } + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if (is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); + input_to_use = &flatten_input; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + } + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output)); + + // Validate output stage for asymmetric quantized types + if (is_quantized) + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( + &gemmlowp_output, biases, output)); + } + + return Status{}; +} + +void NEFullyConnectedLayerEx::run() +{ + if (!_is_prepared) + { + if (!_are_weights_reshaped) + _reshape_weights_output.allocator()->allocate(); + if (!_are_weights_converted) + _converted_weights_output.allocator()->allocate(); + _is_prepared = true; + } + + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Reshape of the weights + if (!_are_weights_reshaped) + { + _reshape_weights_function.run(); + } + + // Convert weights if needed + if (!_are_weights_converted) + { + _convert_weights.run(); + } + + // Prepare GEMM prepare + if (!_is_quantized) + { + _mm_gemm.prepare(); + } + } + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Linearize input if it comes from a convolutional layer + if (_is_fc_after_conv) + { + NEScheduler::get().schedule(&_flatten_kernel, Window::DimY); + } + + // Run matrix multiply + if (_is_quantized) + { + _mm_gemmlowp.run(); + } + else + { + _mm_gemm.run(); + } + + // Accumulate biases if provided + if (_is_quantized) + { + _gemmlowp_output_stage.run(); + } + else + { + if (_accumulate_biases) + { + NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); + } + } +} + +void NEFullyConnectedLayerEx::prepare() +{ +#if 0 // TODO Remove this block + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + auto release_unused = [](Tensor *w) { + if (!w->is_used()) + { + w->allocator()->free(); + } + }; + + // Pointer to current weights + const ITensor *cur_weights = _original_weights; + + // Reshape of the weights (happens only once) + if (!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as 
unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + + cur_weights->mark_as_unused(); + cur_weights = &_reshape_weights_output; + _are_weights_reshaped = true; + } + + // Convert weights if needed (happens only once) + if (!_are_weights_converted) + { + _converted_weights_output.allocator()->allocate(); + _convert_weights.run(); + + cur_weights->mark_as_unused(); + _are_weights_converted = true; + } + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + // Prepare GEMM and release unused weights + if (!_is_quantized) + { + _mm_gemm.prepare(); + } + + // Release converted weights if unused + release_unused(&_converted_weights_output); + + _is_prepared = true; + } +#endif +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..fcac3c7ae --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
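The #if 0 block above records why prepare() is disabled in NEFullyConnectedLayerEx: reshaped or converted weights may be shared with other nodes of the graph, so they cannot be marked unused and freed after the first run, and run() only lazily allocates the intermediates instead. The release pattern the disabled code relies on is just an is_used() guard; a free-standing mirror of that lambda (illustration only):

    #include "arm_compute/runtime/Tensor.h"

    // Backing memory is freed only once no consumer still marks the tensor as used.
    void release_unused(arm_compute::Tensor *w)
    {
      if (!w->is_used())
      {
        w->allocator()->free();
      }
    }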
+ */ + +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h" + +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h> + +using namespace arm_compute; + +void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input, + const arm_compute::ITensor *weights, + const arm_compute::ITensor *biases, + arm_compute::ITensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape, + KernelType kernel_type) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + const ITensor *input_to_use = input; + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); + _neon_reshape.configure(_input, &_neon_buffer); + input_to_use = &_neon_buffer; + } + + _neon_fc = [&]() { + if (kernel_type == KernelType::GENERAL) + { + auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else + { + assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); + + bool is_hybrid = input->info()->data_type() == DataType::F32 && + weights->info()->data_type() == DataType::S8; + + if (is_hybrid) + { + auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else + { + auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + } + }(); + + // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + if (_needs_reshape) + { + _neon_buffer.allocator()->allocate(); + } +} + +void NEFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _neon_reshape.run(); + + _neon_fc->run(); +} + +void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp new file mode 100644 index 000000000..11794a1ea --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), + _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), + _mtx_b_reduction_kernel(), _offset_contribution_kernel(), + _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), + _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), + _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), + _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), + _fuse_output_stage(false), _run_activation(false), _flip_signedness(false) +{ +} + +void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c, + ITensor *output, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + ARM_COMPUTE_UNUSED(c); + ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate( + a->info(), b->info(), c != nullptr ? 
c->info() : nullptr, output->info(), gemm_info)); + + const ITensor *matrix_a = a; + const ITensor *matrix_b = b; + GEMMInfo info = gemm_info; + + // Clear state + _mtx_a_reshape_kernel = nullptr; + _mtx_b_reshape_kernel = nullptr; + + // Set internal variables + _a_offset = a->info()->quantization_info().offset; + _b_offset = b->info()->quantization_info().offset; + _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; + _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); + _is_prepared = false; + _fused_assembly_path = false; + _original_b = b; + + const ITensor *a_to_use = a; + + // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage + if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + _fuse_output_stage = true; + _memory_group.manage(&_mm_result_s32); + TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32); + _mm_result_s32.allocator()->init(info_mm_result_s32); + } + +#ifdef __aarch64__ +#if 0 // Can use after arm compute library v19.11 + switch (a->info()->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::U8: + case DataType::S8: + { + if (a_to_use->info()->data_type() == DataType::QASYMM8 && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + _asm_glue.configure(a_to_use, b, c, output, gemm_info); + _fused_assembly_path = _asm_glue.is_configured(); + } + else + { + _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, + gemm_info); + } + _assembly_path = _asm_glue.is_configured(); + break; + } + default: + { + ARM_COMPUTE_ERROR("Datatype not supported"); + break; + } + } +#endif // 0 + ARM_COMPUTE_ERROR("aarch64 not supported"); +#endif /* __aarch64__ */ + if (!(_assembly_path || _run_vector_matrix_multiplication)) + { + matrix_a = &_tmp_a; + matrix_b = &_tmp_b; + + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / + // 4.0f) ] + TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, + a_to_use->info()->data_type(), a_to_use->info()->quantization_info()); + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / + // 16.0f) ] + TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), + b->info()->quantization_info()); + _tmp_a.allocator()->init(a_info); + _tmp_b.allocator()->init(b_info); + _memory_group.manage(&_tmp_a); + if (!_reshape_b_only_on_first_run) + { + _memory_group.manage(&_tmp_b); + } + + // Configure interleave kernel + { + auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>(); + k->configure(a_to_use, &_tmp_a); + _mtx_a_reshape_kernel = std::move(k); + } + + // Configure transpose kernel + { + auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>(); + k->configure(b, &_tmp_b); + _mtx_b_reshape_kernel = std::move(k); + } + } + + if (!_fused_assembly_path) + { + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0) + { + TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32); + + _vector_sum_col.allocator()->init(info_vector_sum_col); + if (!_reshape_b_only_on_first_run) + { + _memory_group.manage(&_vector_sum_col); + } + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false); + } + + // Initialize Matrix A reduction 
kernel only if _b_offset is not equal to 0 + if (_b_offset != 0) + { + TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32); + + _vector_sum_row.allocator()->init(info_vector_sum_row); + _memory_group.manage(&_vector_sum_row); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), + false); + } + + if (_fuse_output_stage) + { + // Configure matrix multiply kernel + if (!_assembly_path) + { + auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>(); + k->configure(matrix_a, matrix_b, &_mm_result_s32); + _mm_kernel = std::move(k); + } + + _offset_contribution_output_stage_kernel.configure( + &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, + _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset, + _b_offset, info.gemmlowp_output_stage()); + } + else + { + // Configure matrix multiply kernel + if (!_assembly_path) + { + auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>(); + k->configure(matrix_a, matrix_b, output); + _mm_kernel = std::move(k); + } + // Configure offset contribution kernel + _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, + a_to_use->info()->dimension(0), _a_offset, _b_offset); + } + } + + // Allocate tensors + if (!_assembly_path && !_run_vector_matrix_multiplication) + { + _tmp_a.allocator()->allocate(); + if (!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } + } + + if (!_fused_assembly_path) + { + if (_a_offset != 0 && !_reshape_b_only_on_first_run) + { + _vector_sum_col.allocator()->allocate(); + } + + if (_b_offset != 0) + { + _vector_sum_row.allocator()->allocate(); + } + } + + if (_fuse_output_stage) + { + _mm_result_s32.allocator()->allocate(); + } +} + +Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b, + const ITensorInfo *c, const ITensorInfo *output, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, + "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), + "The product AB is defined only if the number of columns in A is " + "equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), + "Matrix A already reshaped is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), + "Matrix B already reshaped is not supported"); + + GEMMInfo info = gemm_info; + const ITensorInfo *matrix_a_info = a; + const ITensorInfo *matrix_b_info = b; + + const ITensorInfo *a_to_use = a; + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + TensorInfo mm_result_s32_info{}; + + int32_t a_offset = a->quantization_info().offset; + int32_t b_offset = b->quantization_info().offset; + + bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; + if (fuse_output_stage) + { + auto_init_if_empty( + mm_result_s32_info, + 
a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + } + + // Check if we need to run the optimized assembly kernel + bool run_optimised = false; + bool run_optimised_requantized = false; + const bool reshape_b_only_on_first_run = info.reshape_b_only_on_first_run(); + if (a_to_use->data_type() == DataType::QASYMM8 && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, output, 1.f, 0.f, + reshape_b_only_on_first_run)); + run_optimised_requantized = run_optimised; + } + else + { + run_optimised = bool(NEGEMMAssemblyDispatch::validate( + a_to_use, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, + reshape_b_only_on_first_run)); + } + + if (run_optimised) + { + ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); + if (info.depth_output_gemm3d() != 0) + { + if (info.reinterpret_input_as_3d()) + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), + "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, + "NEGEMM cannot reinterpret the output tensor as 3D"); + + const bool run_vector_matrix_multiplication = a->dimension(1) < 2; + if (!run_vector_matrix_multiplication) + { + matrix_a_info = &tmp_a_info; + matrix_b_info = &tmp_b_info; + + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / + // 4.0f) ] + TensorShape shape_tmp_a = a->tensor_shape(); + shape_tmp_a.set(0, a->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width + // / 16.0f) ] + TensorShape shape_tmp_b = b->tensor_shape(); + shape_tmp_b.set(0, b->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); + + // Validate interleave kernel + auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info)); + } + } + + if (!run_optimised_requantized) + { + TensorInfo info_vector_sum_col{}; + TensorInfo info_vector_sum_row{}; + + // Validate matrix B reduction kernel only if _a_offset is not equal to 0 + if (a_offset != 0) + { + info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); + + // Configure Matrix B reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate( + b, &info_vector_sum_col, a->dimension(0), false)); + } + + // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 + if (b_offset != 0) + { + info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); + + // Configure matrix A reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate( + a_to_use, &info_vector_sum_row, a->dimension(0), false)); + } + + if (fuse_output_stage) + { + if 
(!run_optimised) + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info)); + } + + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, + info.gemmlowp_output_stage())); + } + else + { + if (!run_optimised) + { + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); + } + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset)); + } + } + return Status{}; +} + +void NEGEMMLowpMatrixMultiplyCoreEx::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Reshape inputs + if (_mtx_a_reshape_kernel) + { + NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); + } + if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run) + { + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + } + + // Run GEMM + if (_asm_glue.is_configured()) + { + _asm_glue.run(); + } + else + { + NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); + } + + if (!_fused_assembly_path) + { + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if (_b_offset != 0) + { + NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0 && !_reshape_b_only_on_first_run) + { + NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + } + + if (_fuse_output_stage) + { + // Run offset contribution kernel + NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY); + } + else + { + // Run offset contribution kernel + NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY); + } + } +} + +void NEGEMMLowpMatrixMultiplyCoreEx::prepare() +{ + if (!_is_prepared) + { + // Run assembly reshape + if (_asm_glue.is_configured() && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + _asm_glue.prepare(); + _original_b->mark_as_unused(); + } + // Run non-assembly reshape + else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run) + { + ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); + + // Run reshape kernel and mark original weights tensor as unused + _tmp_b.allocator()->allocate(); + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + _original_b->mark_as_unused(); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0 && _reshape_b_only_on_first_run) + { + _vector_sum_col.allocator()->allocate(); + NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX); + } + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp new file mode 100644 index 000000000..90dabb35a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEGatherEx.h" + +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +namespace arm_compute +{ +void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return NEGatherKernelEx::validate(input, indices, output, axis); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp new file mode 100644 index 000000000..624185d2c --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" + +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, + ITensor *output, ITensor *hits) +{ + auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} + +Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..1c2c8f027 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() +{ +} + +void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma, + ITensor *beta, float epsilon) +{ + const DataLayout data_layout = input->info()->data_layout(); + + // Configure Kernels + _is_nchw = data_layout == DataLayout::NCHW; + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + + _permute_output.configure(&_permuted_output, output != nullptr ? output : input, + PermutationVector(2U, 0U, 1U)); + _permuted_input.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + _normalization_kernel.configure(input, output, gamma, beta, epsilon); + } +} + +Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return NEInstanceNormalizationLayerKernelEx::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); +} + +void NEInstanceNormalizationLayerEx::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp new file mode 100644 index 000000000..1150cef76 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
+
+#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>();
+  k->configure(input, alpha, output);
+  _kernel = std::move(k);
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
new file mode 100644
index 000000000..84411c266
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ */ + +#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), + _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), + _gemm_output(), _add_output(), _is_prepared(false) +{ +} + +Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, const ITensorInfo *bias, + const ITensorInfo *hidden_state, const ITensorInfo *output, + const ActivationLayerInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, + output); + + const int idx_width = 0; + const int idx_height = 1; + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != + recurrent_weights->dimension(idx_width)); + ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != + recurrent_weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + hidden_state->tensor_shape()); + + auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape( + recurrent_weights, hidden_state->dimension(idx_height)), + 1, input->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate( + &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info)); + + return Status{}; +} + +void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *recurrent_weights, const ITensor *bias, + ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(), + recurrent_weights->info(), bias->info(), + hidden_state->info(), output->info(), info)); + + const int idx_height = 1; + TensorShape shape = misc::shape_calculator::compute_rnn_shape( + recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + + _is_prepared = false; + + // Manage intermediate buffers and configure + _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + + // Manage intermediate buffers and configure + _memory_group.manage(&_fully_connected_out); + _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out); + + _memory_group.manage(&_gemm_output); + _gemm_state_f.configure(hidden_state, 
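// [Editorial note, not part of the patch] Taken together, the kernels
// configured in this function compute the basic RNN cell
//
//   h_t = act(W * x_t + R * h_{t-1} + b)
//
// _fully_connected_kernel produces W * x_t + b, the GEMM configured below
// produces R * h_{t-1}, _add_kernel sums the two, _activation_kernel applies
// act(), and _copy_kernel mirrors the new hidden state into output.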
recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f); + + _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type())); + _memory_group.manage(&_add_output); + + _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, + ConvertPolicy::SATURATE); + + _fully_connected_out.allocator()->allocate(); + _gemm_output.allocator()->allocate(); + + _activation_kernel.configure(&_add_output, hidden_state, info); + _add_output.allocator()->allocate(); + + _copy_kernel.configure(hidden_state, output); +} + +void NERNNLayerEx::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + _fully_connected_kernel.run(); + + _gemm_state_f.run(); + + NEScheduler::get().schedule(&_add_kernel, Window::DimY); + NEScheduler::get().schedule(&_activation_kernel, Window::DimY); + + // copy hidden out to output + NEScheduler::get().schedule(&_copy_kernel, Window::DimY); +} + +void NERNNLayerEx::prepare() +{ + if (!_is_prepared) + { + _fully_connected_kernel.prepare(); + _gemm_state_f.prepare(); + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp new file mode 100644 index 000000000..c65e93570 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels = + arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops); + _reduced_outs = + arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = i == 0 ? input->info()->tensor_shape() + : (_reduced_outs.get() + i - 1)->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(output->info()->data_layout())); + _memory_group.manage(_reduced_outs.get() + i); + _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i], + ReductionOperation::MEAN_SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
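// [Editorial note, not part of the patch] The axis bookkeeping above depends
// on the axes being sorted: once dimension axis_local[0] is removed, every
// later axis shifts down by one, hence the "axis_local[i] - i" index.
// Example with keep_dims == false: input shape [2, 3, 4, 5] and axes {1, 3}
// -> removing dim 1 gives [2, 4, 5], then removing dim 3 - 1 = 2 gives [2, 4].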
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output); + } +} + +void NEReduceMeanEx::run() +{ + _memory_group.acquire(); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } + _memory_group.release(); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp new file mode 100644 index 000000000..b36f8287a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReduceOperation op) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output, ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? 
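// [Editorial note, not part of the patch] A multi-axis reduction is realised
// as a chain of single-axis stages: stage i reads either the original input
// (i == 0) or the previous stage's _reduced_outs[i - 1], and only the final
// stage writes the caller's output when keep_dims is set. Registering the
// intermediates with _memory_group lets their backing memory be reused.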
input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], op); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceOperation::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp new file mode 100644 index 000000000..3c18217ef --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReduceSum.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? 
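// [Editorial sketch, not part of the patch] Hypothetical use of this
// function, assuming a 3-D F32 tensor summed over its second dimension:
//
//   Tensor in, out;
//   in.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
//   NEReduceSum reduce;
//   reduce.configure(&in, Coordinates(1), /*keep_dims=*/true, &out); // out: [8, 1, 2]
//   in.allocator()->allocate();
//   out.allocator()->allocate();
//   /* ...fill `in`, then... */
//   reduce.run();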
input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(input->info()->data_layout())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], + ReductionOperation::SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceSum::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp new file mode 100644 index 000000000..c3431c418 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +namespace +{ +/** Define dimension to split the window + * + * @param[in] axis Reduction axis + * + * @return The dimension to split the window + */ +size_t reduction_window_split_dimension(unsigned int axis) +{ + switch (axis) + { + case 0: + return Window::DimY; + case 1: + case 2: + case 3: + return Window::DimX; + default: + ARM_COMPUTE_ERROR("Unsupported reduction axis"); + } +} +} // namespace + +NEReductionOperationEx::NEReductionOperationEx() + : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis() +{ +} + +Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output, + unsigned int axis, ReduceOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op)); + + return Status{}; +} + +void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis, + ReduceOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + NEReductionOperationEx::validate(input->info(), output->info(), axis, op)); + + // Configure reduction kernel + _reduction_kernel.configure(input, output, axis, op); + _window_split = reduction_window_split_dimension(axis); + _reduction_axis = axis; + + if (axis == 0) + { + // Configure fill border kernel + const BorderSize fill_border_size = _reduction_kernel.border_size(); + PixelValue pixelValue; + switch (op) + { + case ReduceOperation::MIN: + { + switch (input->info()->data_type()) + { + case DataType::F32: + { + pixelValue = PixelValue(std::numeric_limits<float>::max()); + break; + } + case DataType::F16: + { + pixelValue = PixelValue(static_cast<half>(65504.0f)); + break; + } + case DataType::QASYMM8: + { + pixelValue = + PixelValue(255, input->info()->data_type(), input->info()->quantization_info()); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported DataType"); + } + } + break; + } + case ReduceOperation::MAX: + { + switch (input->info()->data_type()) + { + case DataType::F32: + { + pixelValue = PixelValue(-std::numeric_limits<float>::max()); + break; + } + case DataType::F16: + { + pixelValue = PixelValue(static_cast<half>(-65504.0f)); + break; + } + case DataType::QASYMM8: + { + pixelValue = + PixelValue(0, input->info()->data_type(), input->info()->quantization_info()); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported DataType"); + } + } + break; + } + default: + ARM_COMPUTE_ERROR("Reduction Operation unsupported"); + } + _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue); + } +} + +void NEReductionOperationEx::run() +{ + if (_reduction_axis == 0) + { + NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY); + } + NEScheduler::get().schedule(&_reduction_kernel, _window_split); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp new file mode 100644 index 000000000..c9f914fb0 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NESpaceToBatchLayerEx::NESpaceToBatchLayerEx() + : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false) +{ +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape, + const ITensor *paddings, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape, paddings, output); +} + +void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x, + const int block_shape_y, const Size2D &padding_left, + const Size2D &padding_right, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + { + _has_padding = true; + _memset_kernel.configure( + output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info())); + } + _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, + output); +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape, + const ITensorInfo *paddings, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); + + return Status{}; +} + +Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x, + const int block_shape_y, const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate( + input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + + return Status{}; +} + +void NESpaceToBatchLayerEx::run() +{ + // Zero out output only if we have paddings + if (_has_padding) + { + NEScheduler::get().schedule(&_memset_kernel, Window::DimY); + 
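// [Editorial note, not part of the patch] The memset pass is scheduled first
// because the space-to-batch kernel only writes elements that originate from
// the input: whenever the output is larger than the input (i.e. padding was
// requested), the padded positions must already hold the zero value
// configured above.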
}
+  NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
new file mode 100644
index 000000000..b6ae21cc0
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>();
+  k->configure(input, output, block_shape);
+  _kernel = std::move(k);
+}
+
+Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                       int32_t block_shape)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape));
+  return Status{};
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
new file mode 100644
index 000000000..fd15ef05f
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), + _permute_input(), + _permute_weights(), + _permute_output(), + _scaled_output(), + _weights_flipped(), + _permuted_input(), + _permuted_weights(), + _permuted_output(), + _is_nchw(false), + _original_weights(nullptr), + _input(nullptr), + _info(), + _is_prepared(false) +{ +} + +Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + + auto out_dims = transposeconv_output_dimensions( + input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if (is_data_type_quantized_asymmetric(input->data_type()) && bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else if (bias) + { + 
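// [Editorial note, not part of the patch] For float graphs the bias simply
// has to match the input type, whereas the quantized branch above demands an
// S32 bias: QASYMM8 convolutions accumulate in 32-bit integers, so the bias
// is applied at accumulator precision.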
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(), + "Output's dim 0 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(), + "Output's dim 1 is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(), + "Output's dim 2 is invalid."); + } + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + scale_out_info.set_data_layout(input->data_layout()); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != + scale_out_info.dimension(batches_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != + scale_out_info.dimension(channel_idx)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, WeightsInfo())); + + return Status{}; +} + +void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + const DataLayout data_layout = input->info()->data_layout(); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + _is_nchw = data_layout == DataLayout::NCHW; + + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? 
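// [Editorial note, not part of the patch] transposeconv_output_dimensions
// follows the usual transposed-convolution relation; assuming the
// TFLite-style definition used by this runtime, each spatial size is
// approximately
//
//   out = stride * (in - 1) + kernel - pad_total - invalid
//
// with invalid_right / invalid_bottom trimming columns and rows that the
// matching forward convolution would never have produced.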
nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _memory_group.manage(&_scaled_output); + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_weights); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permuted_input.info()->set_quantization_info(input->info()->quantization_info()); + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + // Configure the function to transform the weights tensor from NHWC -> NCHW + _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info()); + _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U)); + _permuted_weights.info()->set_data_layout(DataLayout::NCHW); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(), + _permuted_input.info()->quantization_info()); + scale_out_info.set_data_layout(DataLayout::NCHW); + _scaled_output.allocator()->init(scale_out_info); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::CEIL); + _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(*_permuted_weights.info()->clone()); + _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info()); + _flip_weights.configure(&_permuted_weights, &_weights_flipped); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const auto out_shape = output->info()->tensor_shape(); + TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]}; + TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(), + output->info()->quantization_info()); + _permuted_output.allocator()->init(permuted_out_info); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info); + + // Configure the function to transform the convoluted output to NHWC + _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U)); + + _permuted_input.allocator()->allocate(); + _permuted_weights.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in + // order to match output shape + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); 
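// [Editorial note, not part of the patch] On the NHWC path above,
// PermutationVector(1U, 2U, 0U) and PermutationVector(2U, 0U, 1U) are mutual
// inverses: with an NHWC tensor stored as [C, W, H, N] in ACL dimension
// order, the first maps [C, W, H] to the NCHW ordering [W, H, C] and the
// second maps it back, so input, weights and output round-trip through the
// NCHW-only upsample/convolution kernels unchanged.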
+ _scaled_output.allocator()->init(scale_out_info); + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + } + _scaled_output.allocator()->allocate(); +} + +void NETransposeConvLayer::run() +{ + prepare(); + + // MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + _upsample_f.run(); + _conv_f.run(); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} + +void NETransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + // Permute weights + if (!_is_nchw) + { + _permute_weights.run(); + } + NEScheduler::get().schedule(&_flip_weights, Window::DimZ); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp new file mode 100644 index 000000000..67e1bfb02 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/misc/functions/GenericGather.h" + +namespace arm_compute +{ +namespace misc +{ + +bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() != 4 && output->num_dimensions() == 4 && + input->data_layout() == DataLayout::NCHW); +} + +void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis) +{ + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + arm_compute::PermutationVector pv; + if (shouldPermute(input->info(), output->info())) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H / + // C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + if (shouldPermute(input->info(), output->info())) + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis); + _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_permuted.allocator()->allocate(); + } + else + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +void GenericGather::run(void) +{ + if (utils::isGpuMode()) + { + _cl_gather.run(); + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp new file mode 100644 index 000000000..8025ae28e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/GenericReshapeLayer.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/misc/functions/GenericReshapeLayer.h" + +namespace arm_compute +{ +namespace misc +{ + +namespace +{ + +bool shouldPermute(const arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() == 4 || output->num_dimensions() == 4) && + (input->num_dimensions() != output->num_dimensions() && + input->data_layout() == DataLayout::NCHW); +} + +} // namespace + +void GenericReshapeLayer::configure(const arm_compute::ITensor *input, arm_compute::ITensor *output) +{ + _input = input; + _output = output; + + arm_compute::PermutationVector pv; + if (input->info()->data_layout() == DataLayout::NCHW && input->info()->num_dimensions() == 4 && + output->info()->num_dimensions() != 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape W / H / C into another tensor of shape + // C / W / H + // + // Original | Permuted + // 0 | W | C (from 2) + // 1 | H | W (from 0) + // 2 | C | H (from 1) + // + pv = arm_compute::PermutationVector{2, 0, 1}; + } + else if (input->info()->data_layout() == DataLayout::NCHW && + input->info()->num_dimensions() != 4 && output->info()->num_dimensions() == 4) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape + // W / H / C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + const auto const_input = CAST_CL(const_cast<arm_compute::ITensor *>(input)); + if (shouldPermute(input->info(), output->info())) + { + _cl_permute.configure(const_input, &_cl_permuted, pv); + _cl_reshape.configure(&_cl_permuted, CAST_CL(output)); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_permuted.allocator()->allocate(); + } + else + { + _cl_reshape.configure(const_input, CAST_CL(output)); + } + } + else + { + if (shouldPermute(input->info(), output->info())) + { + _neon_permute.configure(input, &_neon_permuted, pv); + _neon_reshape.configure(&_neon_permuted, output); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _neon_permuted.allocator()->allocate(); + } + else + { + _neon_reshape.configure(input, output); + } + } +} + +void GenericReshapeLayer::run(void) +{ + if (utils::isGpuMode()) + { + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + _cl_reshape.run(); + } + else + { + if (shouldPermute(_input->info(), _output->info())) + { + _neon_permute.run(); + } + _neon_reshape.run(); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp new file mode 100644 index 000000000..44a4bb9ed --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/misc/functions/Utils.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
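Both GenericGather and GenericReshapeLayer above lean on 3-D permutation vectors to hop between NCHW and NHWC orderings. A small sketch of the convention documented in their comment tables, where output dimension `i` is taken from input dimension `pv[i]` (this reads the convention off the comments, not ACL internals, and all names are illustrative):

```cpp
#include <array>
#include <cstdio>

int main()
{
  const std::array<int, 3> chw = {32, 7, 5}; // C / W / H
  const std::array<int, 3> pv = {1, 2, 0};   // as in PermutationVector{1U, 2U, 0U}
  std::array<int, 3> whc{};
  for (int i = 0; i < 3; ++i)
    whc[i] = chw[pv[i]]; // output dim i comes "from" pv[i]
  // (32, 7, 5) -> (7, 5, 32), i.e. C/W/H -> W/H/C as in the table above
  std::printf("(%d, %d, %d) -> (%d, %d, %d)\n", chw[0], chw[1], chw[2], whc[0], whc[1], whc[2]);
  return 0;
}
```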
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/misc/functions/Utils.h" + +namespace arm_compute +{ +namespace misc +{ +namespace utils +{ + +bool isGpuMode() +{ + char *neon = std::getenv("NEON"); + if (neon == nullptr) + return true; + else if (neon[0] == '1') + return false; + return true; +} + +} // namespace utils +} // namespace misc +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h new file mode 100644 index 000000000..f94effea1 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/topk_v2.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file topk_v2.h + * @brief This file contains TopK method and TopContainer class for TopK operation + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ +#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ + +typedef int32_t int32; + +namespace nnfw +{ +namespace rt +{ +namespace optimized_ops +{ +/** + * @brief Class to define the TopK operation + * @note The following code is implemented and modified while referring to the TFLite topk_v2.cc file. + * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than + * TFLite. + * (TFLite additionally supports kTfLiteInt64.) + * + * The class that collects top indexes of k values. Based on template + * tensorflow::gtl::TopN<> but, for optimization, + * it re-uses the same container.
+ */ +template <typename T> class TopContainer +{ +public: + /** + * @brief Prevent the default constructor of this class + */ + TopContainer() = delete; + /** + * @brief Constructor with params + * @param [in] k The top k predictions + * @param [in] row_size Size of row in data + */ + TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr) + { + container_.reserve(std::min(k, row_size) + 1); + } + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + */ + TopContainer(const TopContainer &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * @param [in] topContainer To copy + * @return Reference of TopContainer + */ + TopContainer &operator=(const TopContainer &) = delete; + + /** + * @brief Start collecting + * @param [in] values To set as values + * @return N/A + */ + void start_collecting(const T *values) + { + values_ = values; + container_.clear(); + } + + /** + * @brief Push an index whose value is compared for the top k + * @param [in] a Index of the value to compare + * @return N/A + */ + void push(int32 a) + { + auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; + if (container_.size() <= (size_t)k_) + { + container_.push_back(a); + if (container_.size() == (size_t)(k_ + 1)) + { + std::make_heap(container_.begin(), container_.end(), comparator); + std::pop_heap(container_.begin(), container_.end(), comparator); + } + } + else if (comparator(a, container_.front())) + { + container_.back() = a; + std::push_heap(container_.begin(), container_.end(), comparator); + std::pop_heap(container_.begin(), container_.end(), comparator); + } + } + + /** + * @brief Get sorted result from pushed values + * @return Reference of vector with sorted values + */ + const std::vector<int32> &sorted_result() + { + auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); }; + if (container_.size() <= (size_t)(k_)) + { + std::sort(container_.begin(), container_.end(), comparator); + } + else + { + std::sort_heap(container_.begin(), container_.end() - 1, comparator); + container_.resize(k_); + } + return container_; + } + +private: + int32 k_; + std::vector<int32> container_; + const T *values_ = nullptr; + + bool compare_fun(int32 a, int32 b) const + { + if (values_[b] < values_[a]) + { + return true; + } + else if (values_[b] > values_[a]) + { + return false; + } + else + { + return a < b; + } + } +}; + +/** + * @brief Performs the TopK operation with the given parameters + * @param [in] row_size Size of row in data + * @param [in] num_rows The number of rows in data + * @param [in] data Input data + * @param [in] k The top k predictions + * @param [out] output_indexes Indexes of targets in the top k predictions + * @param [out] output_values Values of targets in the top k predictions + * @return N/A + */ +template <typename T> +void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes, + T *output_values) +{ + TopContainer<T> topc(k, row_size); + for (int row = 0; row < num_rows; ++row) + { + const T *values_row = data + row * row_size; + topc.start_collecting(values_row); + for (int32 c = 0; c < row_size; ++c) + { + topc.push(c); + } + + // Prepare output buffers. + int32 *indexes_row = output_indexes + row * k; + T *output_row = output_values + row * k; + // We always assume that the output is sorted.
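A hypothetical standalone driver for the `TopContainer` class above, mirroring the per-row pattern that `TopK()` uses. Note that `topk_v2.h` itself relies on `<cstdint>`, `<algorithm>`, and `<vector>` already being included by its user:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>
#include "topk_v2.h" // as laid out under compute/ARMComputeEx/src/runtime

void top3_of_one_row()
{
  const std::vector<float> row = {0.1f, 0.9f, 0.4f, 0.9f, 0.2f};
  nnfw::rt::optimized_ops::TopContainer<float> topc(/*k=*/3, /*row_size=*/5);
  topc.start_collecting(row.data());
  for (int32_t c = 0; c < 5; ++c)
    topc.push(c); // push candidate *indices*; values are read through values_
  // Indices come back sorted by descending value, ties broken by lower index:
  for (int32_t idx : topc.sorted_result())
    std::printf("index %d -> %.1f\n", idx, row[idx]); // 1 -> 0.9, 3 -> 0.9, 2 -> 0.4
}
```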
+ const auto &top_k = topc.sorted_result(); + std::copy(top_k.begin(), top_k.end(), indexes_row); + std::transform(top_k.begin(), top_k.end(), output_row, + [values_row](const int32 loc) { return values_row[loc]; }); + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt new file mode 100644 index 000000000..5ea6cdadd --- /dev/null +++ b/compute/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectories() diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt new file mode 100644 index 000000000..9ddec350b --- /dev/null +++ b/compute/cker/CMakeLists.txt @@ -0,0 +1,11 @@ +add_library(nnfw_lib_cker INTERFACE) + +nnfw_find_package(Eigen QUIET) +option(BUILD_CKER_OPTIMIZE "Build optimized cker library" ON) + +if(Eigen_FOUND AND BUILD_CKER_OPTIMIZE) + target_link_libraries(nnfw_lib_cker INTERFACE eigen) + target_compile_definitions(nnfw_lib_cker INTERFACE CKER_OPTIMIZED_EIGEN) +endif(Eigen_FOUND AND BUILD_CKER_OPTIMIZE) + +target_include_directories(nnfw_lib_cker INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) diff --git a/compute/cker/README.md b/compute/cker/README.md new file mode 100644 index 000000000..149320ffc --- /dev/null +++ b/compute/cker/README.md @@ -0,0 +1,7 @@ +# cker + +cker - Portable CPU kernel library + +__cker__ means `CPU kernel` + +Currently, __cker__ is a port of TensorFlow Lite's reference_op kernels (TensorFlow 1.12) and gemmlowp diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h new file mode 100644 index 000000000..39449c68f --- /dev/null +++ b/compute/cker/include/cker/Shape.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SHAPE_H__ +#define __NNFW_CKER_SHAPE_H__ + +#include <algorithm> +#include <cstring> +#include <cassert> +#include <vector> + +#define UNUSED_RELEASE(a) (void)(a) + +namespace nnfw +{ +namespace cker +{ + +class Shape +{ +public: + // Shapes with dimensions up to 4 are stored directly in the structure, while + // larger shapes are separately allocated. + static constexpr int kMaxSmallSize = 4; + + Shape &operator=(Shape const &) = delete; + + Shape() : _size(0) {} + + explicit Shape(int dimensions_count) : _size(dimensions_count) + { + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + Shape(int shape_size, int32_t value) : _size(0) + { + Resize(shape_size); + for (int i = 0; i < shape_size; ++i) + { + SetDim(i, value); + } + } + + Shape(int dimensions_count, const int32_t *dims_data) : _size(0) + { + ReplaceWith(dimensions_count, dims_data); + } + + Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); } + + // Avoid using this constructor. We should be able to delete it when C++17 + // rolls out.
+ Shape(Shape const &other) : _size(other.DimensionsCount()) + { + if (_size > kMaxSmallSize) + { + _dims_pointer = new int32_t[_size]; + } + std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size); + } + + bool operator==(const Shape &comp) const + { + return this->_size == comp._size && + std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0; + } + + ~Shape() + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + } + + inline int32_t DimensionsCount() const { return _size; } + inline int32_t Dims(int i) const + { + assert(i >= 0); + assert(i < _size); + return _size > kMaxSmallSize ? _dims_pointer[i] : _dims[i]; + } + inline void SetDim(int i, int32_t val) + { + assert(i >= 0); + assert(i < _size); + if (_size > kMaxSmallSize) + { + _dims_pointer[i] = val; + } + else + { + _dims[i] = val; + } + } + + inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + // The caller must ensure that the shape is no bigger than 4-D. + inline const int32_t *DimsDataUpTo4D() const { return _dims; } + + inline void Resize(int dimensions_count) + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + _size = dimensions_count; + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + inline void ReplaceWith(int dimensions_count, const int32_t *dims_data) + { + Resize(dimensions_count); + int32_t *dst_dims = DimsData(); + std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); + } + + template <typename T> inline void BuildFrom(const T &src_iterable) + { + const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); + Resize(dimensions_count); + int32_t *data = DimsData(); + for (auto it : src_iterable) + { + *data = it; + ++data; + } + } + + // This will probably be factored out. Old code made substantial use of 4-D + // shapes, and so this function is used to extend smaller shapes. Note that + // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be + // reduced, and (b) some kernels are strictly 4-D, but then the shapes of their + // inputs should already be 4-D, so this function should not be needed. + inline static Shape ExtendedShape(int new_shape_size, const Shape &shape) + { + return Shape(new_shape_size, shape, 1); + } + + inline void BuildFrom(const std::initializer_list<int> init_list) + { + BuildFrom<const std::initializer_list<int>>(init_list); + } + + // Returns the total count of elements, that is the size when flattened into a + // vector. + inline int FlatSize() const + { + int buffer_size = 1; + const int *dims_data = DimsData(); + for (int i = 0; i < _size; i++) + { + const int dim = dims_data[i]; + assert(dim >= 1); + buffer_size *= dim; + } + return buffer_size; + } + + bool operator!=(const Shape &comp) const { return !((*this) == comp); } + +private: + // For use only by ExtendedShape(), written to guarantee (return-value) copy + // elision in C++17. + // This creates a shape padded to the desired size with the specified value.
+ Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0) + { + assert(new_shape_size >= shape.DimensionsCount()); + assert(new_shape_size <= kMaxSmallSize); + Resize(new_shape_size); + const int size_increase = new_shape_size - shape.DimensionsCount(); + for (int i = 0; i < size_increase; ++i) + { + SetDim(i, pad_value); + } + std::memcpy(DimsData() + size_increase, shape.DimsData(), + sizeof(int32_t) * shape.DimensionsCount()); + } + + int32_t _size; + union { + int32_t _dims[kMaxSmallSize]; + int32_t *_dims_pointer; + }; +}; + +inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2) +{ + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + assert(shape1.Dims(index1) == shape2.Dims(index2)); + return shape1.Dims(index1); +} + +inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); } + +inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3) +{ + assert(shape.DimensionsCount() == 4); + const int *dims_data = shape.DimsDataUpTo4D(); + assert(i0 >= 0 && i0 < dims_data[0]); + assert(i1 >= 0 && i1 < dims_data[1]); + assert(i2 >= 0 && i2 < dims_data[2]); + assert(i3 >= 0 && i3 < dims_data[3]); + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +inline int FlatSizeSkipDim(const Shape &shape, int skip_dim) +{ + const int dims_count = shape.DimensionsCount(); + assert(skip_dim >= 0 && skip_dim < dims_count); + const auto *dims_data = shape.DimsData(); + int flat_size = 1; + for (int i = 0; i < dims_count; ++i) + { + flat_size *= (i == skip_dim) ? 1 : dims_data[i]; + } + return flat_size; +} + +// Flat size calculation, checking that dimensions match with one or more other +// arrays. +inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + assert(shape.DimensionsCount() == check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + return shape.FlatSize(); +} + +inline int MatchingFlatSize(const Shape &shape, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + UNUSED_RELEASE(check_shape_0); + assert(shape.DimensionsCount() == check_shape_0.DimensionsCount()); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + return MatchingFlatSize(shape, check_shape_1); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return FlatSizeSkipDim(shape, skip_dim); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SHAPE_H__ diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h new file mode 100644 index 000000000..85654b040 --- /dev/null +++ b/compute/cker/include/cker/Types.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
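A short sketch of the `Shape`, `Offset`, and `FlatSizeSkipDim` helpers defined above, assuming the usual NHWC layout with the last dimension varying fastest:

```cpp
#include <cassert>
#include "cker/Shape.h"

void shape_offset_demo()
{
  const nnfw::cker::Shape shape{2, 3, 4, 5}; // batch, height, width, channel
  assert(shape.FlatSize() == 120);
  // Offset(b, h, w, c) == ((b * H + h) * W + w) * C + c, row-major strides.
  assert(nnfw::cker::Offset(shape, 1, 2, 3, 4) == ((1 * 3 + 2) * 4 + 3) * 5 + 4); // 119
  // Product of every dimension except the skipped one (here: channels).
  assert(nnfw::cker::FlatSizeSkipDim(shape, 3) == 2 * 3 * 4);
}
```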
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TYPES_H__ +#define __NNFW_CKER_TYPES_H__ + +#include <cstdint> + +namespace nnfw +{ +namespace cker +{ + +enum class FusedActivationFunctionType +{ + kNone = 0, + kRelu6 = 1, + kRelu1 = 2, + kRelu = 3, +}; +enum class PaddingType +{ + kNone = 0, + kSame = 1, + kValid = 2, +}; + +struct PaddingValues +{ + int16_t width; + int16_t height; +}; + +struct PoolParams +{ + FusedActivationFunctionType activation; + PaddingType padding_type; + PaddingValues padding_values; + int stride_height; + int stride_width; + int filter_height; + int filter_width; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct SoftmaxParams +{ + // beta is not really used (not a Tensorflow parameter) and not implemented + // for LogSoftmax. + double beta; + // uint8 inference params. Used even when beta defaults to 1.0. + int32_t input_multiplier; + int32_t input_left_shift; + // Reverse scaling is only used by LogSoftmax. + int32_t reverse_scaling_divisor; + int32_t reverse_scaling_right_shift; + int diff_min; +}; + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TYPES_H__ diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h new file mode 100644 index 000000000..d1f1723c4 --- /dev/null +++ b/compute/cker/include/cker/Utils.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_UTILS_H__ +#define __NNFW_CKER_UTILS_H__ + +#include <algorithm> +#include <cstdint> + +#include "cker/gemmlowp/FixedPoint.h" +#include "Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max) +{ + return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max); +} + +inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift) +{ + int left_shift = shift > 0 ? shift : 0; + int right_shift = shift > 0 ? 
0 : -shift; + return gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); +} + +inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, + int left_shift) +{ + return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier); +} + +inline int NodeOffset(int b, int h, int w, int height, int width) +{ + return (b * height + h) * width + w; +} + +inline int CountLeadingZeros(uint32_t integer_input) +{ + const uint32_t one_in_leading_positive = 1U << 31; + int leading_zeros = 0; + while (integer_input < one_in_leading_positive) + { + integer_input <<= 1; + ++leading_zeros; + } + return leading_zeros; +} + +// Comment from tensorflow lite: +// +// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. +// +// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional +// rectangular array of numbers. +// +// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. +// However, as Dims<N> is to be deprecated, this class exists as an adaptor +// to enable simple unoptimized implementations of element-wise broadcasting +// operations. +template <int N> struct NdArrayDesc +{ + // The "extent" of each dimension. Indices along dimension d must be in the + // half-open interval [0, extents[d]). + int extents[N]; + + // The number of *elements* (not bytes) between consecutive indices of each + // dimension. + int strides[N]; +}; + +// Comment from tensorflow lite: +// +// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. +// +// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. +inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3) +{ + assert(i0 >= 0 && i0 < desc.extents[0]); + assert(i1 >= 0 && i1 < desc.extents[1]); + assert(i2 >= 0 && i2 < desc.extents[2]); + assert(i3 >= 0 && i3 < desc.extents[3]); + return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3]; +} + +template <int N> +inline void +NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape, + NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out) +{ + assert(desc0_out != nullptr); + assert(desc1_out != nullptr); + + auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape); + auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape); + + // Copy dims to desc, calculating strides. + int desc0_stride = 1; + int desc1_stride = 1; + for (int i = N - 1; i >= 0; --i) + { + desc0_out->extents[i] = extended_input0_shape.Dims(i); + desc0_out->strides[i] = desc0_stride; + desc0_stride *= extended_input0_shape.Dims(i); + desc1_out->extents[i] = extended_input1_shape.Dims(i); + desc1_out->strides[i] = desc1_stride; + desc1_stride *= extended_input1_shape.Dims(i); + } + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0. 
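`MultiplyByQuantizedMultiplier` above applies a real-valued rescale factor in pure integer arithmetic: the factor is encoded as a Q0.31 multiplier plus a power-of-two shift. A small numeric sketch of that encoding (0.75 is exactly representable as `round(0.75 * 2^31)`):

```cpp
#include <cassert>
#include <cstdint>
#include "cker/Utils.h"

void quantized_multiplier_demo()
{
  const int32_t multiplier = 1610612736; // 0.75 * 2^31, exact
  // round(100 * 0.75) == 75
  assert(nnfw::cker::MultiplyByQuantizedMultiplier(100, multiplier, /*shift=*/0) == 75);
  // A shift of -1 halves the result again: round(100 * 0.75 / 2) == 38
  assert(nnfw::cker::MultiplyByQuantizedMultiplier(100, multiplier, /*shift=*/-1) == 38);
}
```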
+ for (int i = 0; i < N; ++i) + { + const int extent0 = extended_input0_shape.Dims(i); + const int extent1 = extended_input1_shape.Dims(i); + if (extent0 != extent1) + { + if (extent0 == 1) + { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent1; + } + else + { + assert(extent1 == 1); + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent0; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_UTILS_H__ diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h new file mode 100644 index 000000000..645a61485 --- /dev/null +++ b/compute/cker/include/cker/eigen/Utils.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_UTILS_H__ +#define __NNFW_CKER_EIGEN_UTILS_H__ + +#if defined(CKER_OPTIMIZED_EIGEN) + +#include <Eigen/Core> +#include <type_traits> +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +// Make a local VectorMap typedef allowing to map a float array +// as a Eigen matrix expression. The same explanation as for VectorMap +// above also applies here. +template <typename Scalar> +using MatrixMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, + Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + +template <typename Scalar> +MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) +{ + const int dims_count = shape.DimensionsCount(); + const int rows = shape.Dims(dims_count - 1); + const int cols = FlatSizeSkipDim(shape, dims_count - 1); + return MatrixMap<Scalar>(data, rows, cols); +} + +} // namespace cker +} // namespace nnfw + +#endif // defined(CKER_OPTIMIZED_EIGEN) + +#endif // __NNFW_CKER_EIGEN_UTILS_H__ diff --git a/compute/cker/include/cker/gemmlowp/FixedPoint.h b/compute/cker/include/cker/gemmlowp/FixedPoint.h new file mode 100644 index 000000000..159e01a22 --- /dev/null +++ b/compute/cker/include/cker/gemmlowp/FixedPoint.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2015 The Gemmlowp Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
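The descriptors produced by `NdArrayDescsForElementwiseBroadcast` above implement broadcasting purely through strides: a size-1 dimension is widened to the other operand's extent and given stride 0, so indexing along it re-reads the same element. A minimal sketch:

```cpp
#include <cassert>
#include "cker/Utils.h"

void broadcast_desc_demo()
{
  using nnfw::cker::Shape;
  nnfw::cker::NdArrayDesc<4> desc0, desc1;
  // Shapes (3) and (2, 3) are first extended to (1, 1, 1, 3) and (1, 1, 2, 3).
  nnfw::cker::NdArrayDescsForElementwiseBroadcast(Shape{3}, Shape{2, 3}, &desc0, &desc1);
  // desc0's broadcast dimension gets extent 2 but stride 0, so "rows" 0 and 1
  // resolve to the same flat index in the smaller input.
  assert(nnfw::cker::SubscriptToIndex(desc0, 0, 0, 0, 1) ==
         nnfw::cker::SubscriptToIndex(desc0, 0, 0, 1, 1));
}
```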
+ */ + +#ifndef __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__ +#define __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__ + +#include <algorithm> +#include <cassert> + +namespace nnfw +{ +namespace cker +{ +namespace gemmlowp +{ + +inline int32_t RoundingHalfSum(int32_t a, int32_t b) +{ + int64_t a64 = a; + int64_t b64 = b; + int64_t sum = a64 + b64; + int64_t sign = sum >= 0 ? 1 : -1; + return static_cast<int32_t>((sum + sign) / 2); +} + +inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) +{ + bool overflow = a == b && a == std::numeric_limits<int32_t>::min(); + int64_t a_64(a); + int64_t b_64(b); + int64_t ab_64 = a_64 * b_64; + int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); + int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31)); + return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32; +} + +// Correctly-rounded-to-nearest division by a power-of-two. +// Also known as a rounding arithmetic right shift. +inline int32_t RoundingDivideByPOT(int32_t x, int exponent) +{ + assert(exponent >= 0); + assert(exponent <= 31); + const int32_t mask = ((1ll << exponent) - 1); + const int32_t zero = 0; + const int32_t one = 1; + const int32_t remainder = x & mask; + const int32_t threshold = (mask >> 1) + ((x < zero) ? one : zero); + return ((x >> exponent) + ((remainder > threshold) ? one : zero)); +} + +// Returns the product of a run-time integer value by a compile-time power +// of two, with either a positive exponent (equivalent to an arithmetic +// left shift, saturating) or a negative exponent (equivalent to an arithmetic +// right shift, rounding to nearest). +template <int Exponent, int ExponentSign = (Exponent > 0 ? 1 : Exponent < 0 ? -1 : 0)> +struct ImplSaturatingRoundingMultiplyByPOT +{ +}; + +template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 0> +{ + static int32_t eval(int32_t x) { return x; } +}; + +template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, 1> +{ + static int32_t eval(int32_t x) + { + const int32_t min = (std::numeric_limits<int32_t>::min()); + const int32_t max = (std::numeric_limits<int32_t>::max()); + const int32_t threshold = ((1 << (31 - Exponent)) - 1); + const int32_t zero = 0; + const int32_t one = 1; + + const int32_t positive_mask = ((x > threshold) ? ~zero : zero); + const int32_t negative_mask = ((x < -threshold) ? ~zero : zero); + + int32_t result = (x * (one << Exponent)); + result = (positive_mask ? max : result); + result = (negative_mask ? 
min : result); + return result; + } +}; + +template <int Exponent> struct ImplSaturatingRoundingMultiplyByPOT<Exponent, -1> +{ + static int32_t eval(int32_t x) { return RoundingDivideByPOT(x, -Exponent); } +}; + +template <int Exponent> int32_t SaturatingRoundingMultiplyByPOT(int32_t x) +{ + return ImplSaturatingRoundingMultiplyByPOT<Exponent>::eval(x); +} + +template <int tIntegerBits> class FixedPoint +{ +public: + static constexpr int kTotalBits = 8 * sizeof(int32_t); + static constexpr int kIntegerBits = tIntegerBits; + static constexpr int kFractionalBits = kTotalBits - 1 - kIntegerBits; + static_assert(kIntegerBits >= 0 && kIntegerBits < kTotalBits, "bad IntegerBits"); + + static int32_t ScalarRawMax() { return std::numeric_limits<int32_t>::max(); } + + static FixedPoint FromRaw(int32_t x) + { + FixedPoint retval; + retval.raw() = x; + return retval; + } + + static FixedPoint FromScalarRaw(int32_t x) { return FromRaw(x); } + + template <int Exponent> static FixedPoint ConstantPOT() + { + static constexpr int kOffset = kFractionalBits + Exponent; + static_assert(kOffset < 31, "Constant not exactly representable in this fixed-point format"); + return FromScalarRaw((int32_t)1 << kOffset); + } + + static FixedPoint Zero() { return FromScalarRaw(0); } + + static FixedPoint One() + { + return FromScalarRaw(kIntegerBits == 0 ? ScalarRawMax() : ((int32_t)1 << kFractionalBits)); + } + + int32_t raw() const { return i_; } + int32_t &raw() { return i_; } + +private: + int32_t i_; +}; + +// A FixedPoint multiplication is just a +// SaturatingRoundingDoublingHighMul operation on the underlying +// raw integer values. The IntegerBits simply add up, as is obvious +// from the fact that the range is [-2^IntegerBits, 2^IntegerBits). +template <int tIntegerBits_a, int tIntegerBits_b> +FixedPoint<tIntegerBits_a + tIntegerBits_b> operator*(FixedPoint<tIntegerBits_a> a, + FixedPoint<tIntegerBits_b> b) +{ + FixedPoint<tIntegerBits_a + tIntegerBits_b> c; + c.raw() = SaturatingRoundingDoublingHighMul(a.raw(), b.raw()); + return c; +} + +// Tweaking IntegerBits gives exact multiplication by a power of two. +template <int tExponent, int tIntegerBits> +FixedPoint<tExponent + tIntegerBits> ExactMulByPot(FixedPoint<tIntegerBits> a) +{ + FixedPoint<tExponent + tIntegerBits> c; + c.raw() = a.raw(); + return c; +} + +template <int tIntegerBits> +FixedPoint<tIntegerBits> operator+(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b) +{ + return FixedPoint<tIntegerBits>::FromRaw((a.raw() + b.raw())); +} +template <int tIntegerBits> +FixedPoint<tIntegerBits> operator-(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b) +{ + return FixedPoint<tIntegerBits>::FromRaw((a.raw() - b.raw())); +} +template <int tIntegerBits> +FixedPoint<tIntegerBits> operator&(FixedPoint<tIntegerBits> a, FixedPoint<tIntegerBits> b) +{ + return FixedPoint<tIntegerBits>::FromRaw((a.raw() & b.raw())); +} + +// Rescale changes the number of IntegerBits and updates the underlying +// raw integer value accordingly. +template <int tIntegerBitsDst, int tIntegerBitsSrc> +FixedPoint<tIntegerBitsDst> Rescale(FixedPoint<tIntegerBitsSrc> x) +{ + static constexpr int kExponent = tIntegerBitsSrc - tIntegerBitsDst; + FixedPoint<tIntegerBitsDst> result; + result.raw() = SaturatingRoundingMultiplyByPOT<kExponent>(x.raw()); + return result; +} + +// Implementation of exponential function. + +// Returns exp(x) for x in [-1/4, 0). 
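A numeric sketch of the raw helpers this `FixedPoint` class builds on: `RoundingDivideByPOT` rounds to nearest with ties away from zero, and `SaturatingRoundingDoublingHighMul` is the Q0.31 multiply, so 0.5 × 0.5 comes out as 0.25 (note the header itself relies on `<limits>` being available transitively):

```cpp
#include <cassert>
#include <cstdint>
#include <limits>
#include "cker/gemmlowp/FixedPoint.h"

void fixed_point_demo()
{
  using namespace nnfw::cker::gemmlowp;
  assert(RoundingDivideByPOT(13, 2) == 3);   //  3.25 ->  3
  assert(RoundingDivideByPOT(14, 2) == 4);   //  3.5  ->  4 (ties away from zero)
  assert(RoundingDivideByPOT(-14, 2) == -4); // -3.5  -> -4
  // 0.5 in Q0.31 is 1 << 30; the doubling high-mul keeps the product in Q0.31.
  const int32_t half = 1 << 30;
  assert(SaturatingRoundingDoublingHighMul(half, half) == (1 << 29)); // 0.25
}
```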
+inline FixedPoint<0> exp_on_interval_between_negative_one_quarter_and_0_excl(FixedPoint<0> a) +{ + typedef FixedPoint<0> F; + const F constant_term = F::FromScalarRaw(RoundingDivideByPOT(1895147668, 0)); + const F constant_1_over_3 = F::FromScalarRaw(RoundingDivideByPOT(715827883, 0)); + // We're evaluating a Taylor expansion around -1/8, so we do the change of + // variable: x = a + 1/8. + // In fixed-point with 0 integer bits, 1/8 is represented by 1 << 28. + F x = a + F::template ConstantPOT<-3>(); + F x2 = x * x; + F x3 = x2 * x; + F x4 = x2 * x2; + F x4_over_4 = F::FromScalarRaw(SaturatingRoundingMultiplyByPOT<-2>(x4.raw())); + F x4_over_24_plus_x3_over_6_plus_x2_over_2 = F::FromScalarRaw( + SaturatingRoundingMultiplyByPOT<-1>((((x4_over_4 + x3) * constant_1_over_3) + x2).raw())); + return (constant_term + constant_term * (x + x4_over_24_plus_x3_over_6_plus_x2_over_2)); +} + +// Returns exp(x) for x < 0. +template <int tIntegerBits> FixedPoint<0> exp_on_negative_values(FixedPoint<tIntegerBits> a) +{ + typedef FixedPoint<tIntegerBits> InputF; + typedef FixedPoint<0> ResultF; + static constexpr int kFractionalBits = InputF::kFractionalBits; + static constexpr int kIntegerBits = InputF::kIntegerBits; + const InputF kOneQuarter = InputF::template ConstantPOT<-2>(); + InputF mask = kOneQuarter - InputF::FromScalarRaw(1); + InputF a_mod_quarter_minus_one_quarter = (a & mask) - kOneQuarter; + ResultF result = exp_on_interval_between_negative_one_quarter_and_0_excl( + Rescale<0>(a_mod_quarter_minus_one_quarter)); + int32_t remainder = (a_mod_quarter_minus_one_quarter - a).raw(); + +#define GEMMLOWP_EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier) \ + if (kIntegerBits > Exponent) \ + { \ + const ResultF kMultiplier = \ + ResultF::FromScalarRaw(RoundingDivideByPOT(FixedPointMultiplier, 0)); \ + static constexpr int kShiftAmount = \ + ((kIntegerBits > Exponent) ? (kFractionalBits + Exponent) : 0); \ + result = ((remainder & (1 << kShiftAmount)) ? (result * kMultiplier) : result); \ + } + + GEMMLOWP_EXP_BARREL_SHIFTER(-2, 1672461947); + GEMMLOWP_EXP_BARREL_SHIFTER(-1, 1302514674); + GEMMLOWP_EXP_BARREL_SHIFTER(+0, 790015084); + GEMMLOWP_EXP_BARREL_SHIFTER(+1, 290630308); + GEMMLOWP_EXP_BARREL_SHIFTER(+2, 39332535); + GEMMLOWP_EXP_BARREL_SHIFTER(+3, 720401); + GEMMLOWP_EXP_BARREL_SHIFTER(+4, 242); + +#undef GEMMLOWP_EXP_BARREL_SHIFTER + + static constexpr int clampB = ((kIntegerBits > 5) ? (36 - kIntegerBits) : 0); + if (kIntegerBits > 5) + { + const InputF clamp = InputF::FromScalarRaw(RoundingDivideByPOT(-(1 << clampB), 0)); + result.raw() = ((a.raw() < clamp.raw()) ? ResultF::Zero().raw() : result.raw()); + } + + result.raw() = (a.raw() ? result.raw() : ResultF::One().raw()); + return result; +} + +// Returns 1 / (1 + x) for x in (0, 1). +inline FixedPoint<0> one_over_one_plus_x_for_x_in_0_1(FixedPoint<0> a) +{ + typedef FixedPoint<0> F0; + typedef FixedPoint<2> F2; + F0 half_denominator = F0::FromScalarRaw(RoundingHalfSum(a.raw(), F0::One().raw())); + // Newton-Raphson division + // https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division + // Refer to that page for the logic behind the 48/17 and 32/17 constants. 
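For intuition, the same Newton-Raphson iteration in plain float (an illustrative sketch, not part of this file): the seed 48/17 − (32/17)·d is the standard starting point for d in [0.5, 1), and each step roughly doubles the number of correct bits:

```cpp
#include <cassert>
#include <cmath>

float newton_reciprocal(float d) // assumes d in [0.5, 1)
{
  float x = 48.0f / 17.0f - 32.0f / 17.0f * d; // initial guess for 1/d
  for (int i = 0; i < 3; ++i)
    x = x + x * (1.0f - d * x); // Newton-Raphson step for f(x) = 1/x - d
  return x;
}

void newton_demo() { assert(std::fabs(newton_reciprocal(0.8f) - 1.25f) < 1e-4f); }
```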
+ const F2 constant_48_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(1515870810, 0)); + const F2 constant_neg_32_over_17 = F2::FromScalarRaw(RoundingDivideByPOT(-1010580540, 0)); + F2 x = constant_48_over_17 + half_denominator * constant_neg_32_over_17; + for (int i = 0; i < 3; i++) + { + F2 half_denominator_times_x = half_denominator * x; + F2 one_minus_half_denominator_times_x = F2::One() - half_denominator_times_x; + x = x + Rescale<2>(x * one_minus_half_denominator_times_x); + } + return Rescale<0>(ExactMulByPot<-1>(x)); +} + +} // namespace gemmlowp +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_GEMMLOWP_FIXED_POINT_H__ diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h new file mode 100644 index 000000000..b20919429 --- /dev/null +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_AVERAGE_POOL_H__ +#define __NNFW_CKER_AVERAGE_POOL_H__ + +#if defined(CKER_OPTIMIZED_EIGEN) +#include "cker/operation/optimized/AveragePool.h" +#endif // defined(CKER_OPTIMIZED_EIGEN) + +#include "cker/operation/reference/AveragePool.h" + +namespace nnfw +{ +namespace cker +{ + +inline void AveragePool(const PoolParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ +#if defined(CKER_OPTIMIZED_EIGEN) + optimized::AveragePool(params, input_shape, input_data, output_shape, output_data); +#else // defined(CKER_OPTIMIZED_EIGEN) + reference::AveragePool(params, input_shape, input_data, output_shape, output_data); +#endif // defined(CKER_OPTIMIZED_EIGEN) +} + +inline void AveragePool(const PoolParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) +{ + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in 
the input array. + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start); + if (filter_count <= 0) + { + continue; + } + for (int channel = 0; channel < depth; ++channel) + { + int32_t acc = 0; + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + acc += input_data[Offset(input_shape, batch, in_y, in_x, channel)]; + } + } + acc = (acc + filter_count / 2) / filter_count; + acc = std::max(acc, params.quantized_activation_min); + acc = std::min(acc, params.quantized_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = + static_cast<uint8_t>(acc); + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_AVERAGE_POOL_H__ diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h new file mode 100644 index 000000000..60dd02651 --- /dev/null +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ +#define __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ + +#include <functional> +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +struct BinaryArithmeticOpParam +{ + // Shape dependent / common to data / op types. + // BroadcastableOpCategory broadcast_category; + // uint8 inference params. + int32_t input1_offset; + int32_t input2_offset; + int32_t output_offset; + int32_t output_multiplier; + int32_t output_shift; + // Add / Sub, not Mul, uint8 inference params. + int32_t left_shift; + int32_t input1_multiplier; + int32_t input1_shift; + int32_t input2_multiplier; + int32_t input2_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + + // Processed output dimensions. + // Let input "a" be the one that broadcasts in the faster-changing dimension. + // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and + // {b0, b1, b2, b3, b4}, + // broadcast_shape[4] = b0 = a0. + // broadcast_shape[3] = b1; a1 = 1. + // broadcast_shape[2] = b2 = a2. + // broadcast_shape[1] = a3; b3 = 1. + // broadcast_shape[0] = b4 = a4. 
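The uint8 `AveragePool` above rounds the integer mean with `(acc + filter_count / 2) / filter_count`, which is round-to-nearest for the accumulator (always non-negative, since uint8 inputs are summed). A quick check:

```cpp
#include <cassert>

void average_rounding_demo()
{
  const int filter_count = 4;
  assert((7 + filter_count / 2) / filter_count == 2);  //  7 / 4 = 1.75 -> 2
  assert((9 + filter_count / 2) / filter_count == 2);  //  9 / 4 = 2.25 -> 2
  assert((10 + filter_count / 2) / filter_count == 3); // 10 / 4 = 2.5  -> 3
}
```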
+ // int broadcast_shape[5]; +}; + +template <typename T> +inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, T *output_data, + const std::function<T(const T &, const T &)> &fn) +{ + const int32_t flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + output_data[i] = ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), + params.quantized_activation_min, + params.quantized_activation_max); + } +} + +template <> +inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data, + const std::function<float(const float &, const float &)> &fn) +{ + const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = + ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), + params.float_activation_min, params.float_activation_max); + } +} + +template <typename T> +inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶ms, + const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data, + const std::function<T(const T &, const T &)> &fn) +{ + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); + + // Comment from tensorflow lite: + // + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) + { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( + fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); + } + } + } + } +} + +template <> +inline void BroadcastBinaryArithmeticOpSlow( + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, + const Shape &input2_shape, const float *input2_data, const Shape &output_shape, + float *output_data, const std::function<float(const float &, const float &)> &fn) +{ + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); + + for (int b = 0; b < extended_output_shape.Dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) + { + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( + fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.float_activation_min, params.float_activation_max); + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h new file mode 100644 index 000000000..69a179c8c --- /dev/null +++ b/compute/cker/include/cker/operation/Concatenation.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
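A hypothetical call of the non-broadcast `BinaryArithmeticOp<float>` specialization above, performing an elementwise Add with the activation clamp effectively disabled by a wide min/max range:

```cpp
#include <limits>
#include <vector>
#include "cker/operation/BinaryArithmeticOps.h"

void add_demo()
{
  using nnfw::cker::Shape;
  nnfw::cker::BinaryArithmeticOpParam params{};
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  const Shape shape{1, 1, 1, 4};
  const std::vector<float> a = {1, 2, 3, 4}, b = {10, 20, 30, 40};
  std::vector<float> out(4);
  nnfw::cker::BinaryArithmeticOp<float>(
      params, shape, a.data(), shape, b.data(), shape, out.data(),
      [](const float &x, const float &y) { return x + y; });
  // out == {11, 22, 33, 44}
}
```

When the shapes differ, `BroadcastBinaryArithmeticOpSlow` is the matching entry point; it walks the stride-0 descriptors instead of a flat loop.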
+ */ + +#ifndef __NNFW_CKER_CONCATENATION_H__ +#define __NNFW_CKER_CONCATENATION_H__ + +#include <cstdint> + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +struct ConcatenationParams +{ + int8_t axis; + const int32_t *input_zeropoint; + const float *input_scale; + uint16_t inputs_count; + int32_t output_zeropoint; + float output_scale; +}; + +template <typename Scalar> +inline void Concatenation(const ConcatenationParams ¶ms, const Shape *const *input_shapes, + const Scalar *const *input_data, const Shape &output_shape, + Scalar *output_data) +{ + int axis = params.axis; + int inputs_count = params.inputs_count; + const int concat_dimensions = output_shape.DimensionsCount(); + assert(axis < concat_dimensions); + + int64_t concat_size = 0; + for (int i = 0; i < inputs_count; i++) + { + assert(input_shapes[i]->DimensionsCount() == concat_dimensions); + for (int j = 0; j < concat_dimensions; j++) + { + if (j != axis) + { + auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j); + UNUSED_RELEASE(dim_checked); + } + } + concat_size += input_shapes[i]->Dims(axis); + } + assert(concat_size == output_shape.Dims(axis)); + int64_t outer_size = 1; + for (int i = 0; i < axis; ++i) + { + outer_size *= output_shape.Dims(i); + } + // For all input arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < concat_dimensions; ++i) + { + base_inner_size *= output_shape.Dims(i); + } + + Scalar *output_ptr = output_data; + for (int k = 0; k < outer_size; k++) + { + for (int i = 0; i < inputs_count; ++i) + { + const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size; + memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar)); + output_ptr += copy_size; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CONCATENATION_H__ diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h new file mode 100644 index 000000000..35b0336fa --- /dev/null +++ b/compute/cker/include/cker/operation/Conv.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_CONV_H__ +#define __NNFW_CKER_CONV_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +struct ConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. 
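A hypothetical use of the `Concatenation` kernel above: two (1, 2) tensors joined along axis 1 into a (1, 4) output. Only `axis` and `inputs_count` matter for this non-quantized template; the zero-point/scale fields are left zeroed:

```cpp
#include <vector>
#include "cker/operation/Concatenation.h"

void concat_demo()
{
  using nnfw::cker::Shape;
  nnfw::cker::ConcatenationParams params{};
  params.axis = 1;
  params.inputs_count = 2;

  const Shape in_shape{1, 2};
  const Shape *input_shapes[] = {&in_shape, &in_shape};
  const std::vector<float> a = {1, 2}, b = {3, 4};
  const float *input_data[] = {a.data(), b.data()};

  const Shape out_shape{1, 4};
  std::vector<float> out(4);
  nnfw::cker::Concatenation<float>(params, input_shapes, input_data, out_shape, out.data());
  // out == {1, 2, 3, 4}: per outer slice, each input's axis run is memcpy'd in turn
}
```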
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
+                 const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+                 const float *bias_data, const Shape &output_shape, float *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  UNUSED_RELEASE(bias_shape);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data)
+  {
+    assert(bias_shape.FlatSize() == output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+        {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          float total = 0.f;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              // If the location is outside the bounds of the input image,
+              // use zero as a default value.
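+              // For example, with stride 1, a 3x3 filter and SAME padding
+              // (pad_width == pad_height == 1), out_x == 0 gives
+              // in_x_origin == -1, so the filter_x == 0 column falls outside
+              // the image and is skipped by the bounds check below - the same
+              // result as multiplying against an explicit zero border.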
+              if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+              {
+                const int in_offset = Offset(input_shape, batch, in_y, in_x, 0);
+                const int filter_offset = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+                for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+                {
+                  float input_value = input_data[in_offset + in_channel];
+                  float filter_value = filter_data[filter_offset + in_channel];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+          }
+          float bias_value = 0.0f;
+          if (bias_data)
+          {
+            bias_value = bias_data[out_channel];
+          }
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+            ActivationFunctionWithMinMax(total + bias_value, output_activation_min,
+                                         output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+                 const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+                 const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  assert(output_activation_min <= output_activation_max);
+
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  UNUSED_RELEASE(bias_shape);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data)
+  {
+    assert(bias_shape.FlatSize() == output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+        {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32_t acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              // If the location is outside the bounds of the input image,
+              // use zero as a default value.
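+              // Skipping out-of-range taps stays exact in the quantized case
+              // because the accumulation below works on (input_val + input_offset):
+              // assuming the usual convention that input_offset holds the
+              // negated input zero point, a tap at the zero point contributes
+              // 0, so an implicit zero border and a skipped tap are equivalent.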
+              if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+              {
+                // Compute the offsets only for in-bounds coordinates; Offset()
+                // may not be called with out-of-range indices.
+                const int in_base = Offset(input_shape, batch, in_y, in_x, 0);
+                const int filter_base = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+                for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+                {
+                  int32_t input_val = input_data[in_channel + in_base];
+                  int32_t filter_val = filter_data[in_channel + filter_base];
+                  acc += (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+          }
+          if (bias_data)
+          {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+            static_cast<uint8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONV_H__
diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h
new file mode 100644
index 000000000..7d022477d
--- /dev/null
+++ b/compute/cker/include/cker/operation/DepthwiseConv.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
+#define __NNFW_CKER_DEPTHWISE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct DepthwiseConvParams
+{
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  int16_t stride_width;
+  int16_t stride_height;
+  int16_t dilation_width_factor;
+  int16_t dilation_height_factor;
+  int16_t depth_multiplier;
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
+  int output_shift;
+  // uint8, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+                          const uint8_t *input_data, const Shape &filter_shape,
+                          const uint8_t *filter_data, const Shape &bias_shape,
+                          const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+
+  assert(output_activation_min <= output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  assert(output_depth == input_depth * depth_multiplier);
+  assert(bias_shape.FlatSize() == output_depth);
+  UNUSED_RELEASE(output_depth);
+  UNUSED_RELEASE(bias_shape);
+
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int ic = 0; ic < input_depth; ++ic)
+        {
+          for (int m = 0; m < depth_multiplier; m++)
+          {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+            {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+              {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y = in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
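+                // The depthwise filter is laid out as
+                // [1, filter_height, filter_width, output_depth], which is why
+                // the lookup below indexes dimension 0 with a constant 0 and
+                // the last dimension with oc rather than ic.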
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+                {
+                  int32_t input_val = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                  int32_t filter_val = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+                  acc += (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+            if (bias_data)
+            {
+              acc += bias_data[oc];
+            }
+            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, b, out_y, out_x, oc)] = static_cast<uint8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
+                          const float *input_data, const Shape &filter_shape,
+                          const float *filter_data, const Shape &bias_shape, const float *bias_data,
+                          const Shape &output_shape, float *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  assert(output_depth == input_depth * depth_multiplier);
+  assert(bias_shape.FlatSize() == output_depth);
+  UNUSED_RELEASE(output_depth);
+  UNUSED_RELEASE(bias_shape);
+
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int ic = 0; ic < input_depth; ++ic)
+        {
+          for (int m = 0; m < depth_multiplier; m++)
+          {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            float total = 0.f;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+            {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+              {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y = in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
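+                // As a concrete example of the channel mapping above: with
+                // input_depth == 2 and depth_multiplier == 3, input channel 0
+                // feeds output channels 0..2 and input channel 1 feeds output
+                // channels 3..5 (oc == m + ic * depth_multiplier).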
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+                {
+                  float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                  float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+            float bias_value = 0.0f;
+            if (bias_data)
+            {
+              bias_value = bias_data[oc];
+            }
+            output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax(
+              total + bias_value, output_activation_min, output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEPTHWISE_CONV_H__
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
new file mode 100644
index 000000000..428fb1b53
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
+#define __NNFW_CKER_FULLY_CONNECTED_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct FullyConnectedParams
+{
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32_t input_offset;
+  int32_t weights_offset;
+  int32_t output_offset;
+  int32_t output_multiplier;
+  int output_shift;
+  // uint8, etc, activation params.
+  int32_t quantized_activation_min;
+  int32_t quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+  // FullyConnectedWeightsFormat weights_format;
+};
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+                           const float *input_data, const Shape &weights_shape,
+                           const float *weights_data, const Shape &bias_shape,
+                           const float *bias_data, const Shape &output_shape, float *output_data)
+{
+  UNUSED_RELEASE(input_shape);
+  UNUSED_RELEASE(bias_shape);
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
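+  // For example, a [2, 5] input against [3, 5] weights (stored as
+  // [output_depth, accum_depth]) yields batches == 2, output_depth == 3,
+  // accum_depth == 5 and a [2, 3] output.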
+  const int output_dims_count = output_shape.DimensionsCount();
+  const int weights_dims_count = weights_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+  const int output_depth =
+    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int out_c = 0; out_c < output_depth; ++out_c)
+    {
+      float total = 0.f;
+      for (int d = 0; d < accum_depth; ++d)
+      {
+        total += input_data[b * accum_depth + d] * weights_data[out_c * accum_depth + d];
+      }
+      float bias_value = 0.0f;
+      if (bias_data)
+      {
+        bias_value = bias_data[out_c];
+      }
+      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+        total + bias_value, output_activation_min, output_activation_max);
+    }
+  }
+}
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+                           const uint8_t *input_data, const Shape &filter_shape,
+                           const uint8_t *filter_data, const Shape &bias_shape,
+                           const int32_t *bias_data, const Shape &output_shape,
+                           uint8_t *output_data)
+{
+  UNUSED_RELEASE(input_shape);
+  UNUSED_RELEASE(bias_shape);
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  assert(filter_shape.DimensionsCount() >= 2);
+  assert(output_shape.DimensionsCount() >= 1);
+
+  assert(output_activation_min <= output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_depth =
+    MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int out_c = 0; out_c < output_depth; ++out_c)
+    {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d)
+      {
+        int32_t input_val = input_data[b * accum_depth + d];
+        int32_t filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data)
+      {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FULLY_CONNECTED_H__
diff --git a/compute/cker/include/cker/operation/Gather.h b/compute/cker/include/cker/operation/Gather.h
new file mode 100644
index 000000000..9cd96eeb7
--- /dev/null
+++ b/compute/cker/include/cker/operation/Gather.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GATHER_H__
+#define __NNFW_CKER_GATHER_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cassert>
+#include <cstring>
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct GatherParams
+{
+  int32_t axis;
+};
+
+template <typename T, typename CoordsT = int32_t>
+inline void Gather(const GatherParams &op_params, const Shape &input_shape, const T *input_data,
+                   const Shape &coords_shape, const CoordsT *coords_data, const Shape &,
+                   T *output_data)
+{
+  int axis = op_params.axis;
+  if (axis < 0)
+  {
+    axis += input_shape.DimensionsCount();
+  }
+  assert(axis >= 0);
+  assert(axis < input_shape.DimensionsCount());
+  const int axis_size = input_shape.Dims(axis);
+  const int coords_count = coords_shape.FlatSize();
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int inner_size = 1;
+  for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+  {
+    inner_size *= input_shape.Dims(i);
+  }
+
+  for (int outer = 0; outer < outer_size; ++outer)
+  {
+    for (int i = 0; i < coords_count; ++i)
+    {
+      assert(coords_data[i] >= 0);
+      assert(coords_data[i] < axis_size);
+      std::memcpy(output_data + (outer * coords_count + i) * inner_size,
+                  input_data + (outer * axis_size + coords_data[i]) * inner_size,
+                  sizeof(T) * inner_size);
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GATHER_H__
diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h
new file mode 100644
index 000000000..794dcebc8
--- /dev/null
+++ b/compute/cker/include/cker/operation/InstanceNorm.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_INSTANCE_NORM_H__
+#define __NNFW_CKER_INSTANCE_NORM_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+struct InstanceNormParams
+{
+  float epsilon;
+  float float_activation_min;
+  float float_activation_max;
+};
+
+inline void InstanceNorm(const InstanceNormParams &params, const Shape &input_shape,
+                         const float *input_data, const Shape &gamma_shape, const float *gamma_data,
+                         const Shape &beta_shape, const float *beta_data, const Shape &output_shape,
+                         float *output_data)
+{
+  const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
+  const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
+  const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+
+  UNUSED_RELEASE(gamma_shape);
+  UNUSED_RELEASE(beta_shape);
+  assert(output_activation_min <= output_activation_max);
+
+  for (int32_t batch = 0; batch < batches; batch++)
+  {
+    for (int32_t channel = 0; channel < channels; channel++)
+    {
+      double sum = 0.0;
+      double square_sum = 0.0;
+      int32_t size = heights * widths;
+
+      for (int32_t height = 0; height < heights; height++)
+      {
+        for (int32_t width = 0; width < widths; width++)
+        {
+          double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
+          sum += input_val;
+          square_sum += (input_val * input_val);
+        }
+      }
+
+      double mean = sum / size;
+      double var = square_sum / size - mean * mean;
+
+      double gamma = gamma_data[channel];
+      double beta = beta_data[channel];
+
+      double a = gamma / (std::sqrt(var + params.epsilon));
+      double b = -mean * a + beta;
+
+      for (int32_t height = 0; height < heights; height++)
+      {
+        for (int32_t width = 0; width < widths; width++)
+        {
+          double input_value = input_data[Offset(input_shape, batch, height, width, channel)];
+          double output_value = input_value * a + b;
+          output_data[Offset(output_shape, batch, height, width, channel)] =
+            ActivationFunctionWithMinMax(static_cast<float>(output_value), output_activation_min,
+                                         output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_INSTANCE_NORM_H__
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
new file mode 100644
index 000000000..872095531
--- /dev/null
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGISTIC_H__
+#define __NNFW_CKER_LOGISTIC_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+                     float *output_data)
+{
+  // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2)
+  const int size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < size; i++)
+  {
+    output_data[i] = 1.f / (1.f + std::exp(-input_data[i]));
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGISTIC_H__
diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h
new file mode 100644
index 000000000..326168b99
--- /dev/null
+++ b/compute/cker/include/cker/operation/MaxPool.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MAX_POOL_H__
+#define __NNFW_CKER_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/optimized/MaxPool.h"
+#include "cker/operation/reference/MaxPool.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+  optimized::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#else  // defined(CKER_OPTIMIZED_EIGEN)
+  reference::MaxPool(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data,
+                    const Shape &output_shape, uint8_t *output_data)
+{
+  assert(params.quantized_activation_min <= params.quantized_activation_max);
+  assert(params.quantized_activation_min >= 0);
+  assert(params.quantized_activation_max <= 255);
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int channel = 0; channel < depth; ++channel)
+        {
+          const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+          uint8_t max = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+          {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
+            {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<uint8_t>(max, params.quantized_activation_min);
+          max = std::min<uint8_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+            static_cast<uint8_t>(max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
new file mode 100644
index 000000000..af432f3a8
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PAD_H__
+#define __NNFW_CKER_PAD_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <algorithm>
+#include <cstring>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
+                const float *input_data, const Shape &output_shape, float *output_data,
+                const float *constant_value_data)
+{
+  // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
+  // TODO: come up with more subtle solution that uses subtensors like arm compute
+  // TODO: Check if it works for all layouts
+
+  using PaddingInfo = std::pair<int32_t, int32_t>;
+  /** List of padding information */
+  using PaddingList = std::vector<PaddingInfo>;
+
+  auto constant_value = constant_value_data ? *constant_value_data : 0;
+  assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
+
+  PaddingList padding_list(pad_rank);
+  for (int32_t n = 0; n < pad_rank; ++n)
+  {
+    const int32_t *from = padding_data + (n * 2);
+    padding_list[n] = {from[0], from[1]};
+  }
+  for (int32_t i = 0; i < pad_rank; ++i)
+  {
+    assert(output_shape.Dims(i) ==
+           input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
+  }
+  /* Use pad_rank since given input/output shapes are expanded to 4d before calling all cker
+     functions:
+     1. to prevent access violation in padding_list;
+     2. handling as 4d is slower than as 2d/3d.
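+     For example, a rank-2 pad of a [2, 3] input with padding_data
+     {1, 1, 2, 2} gives padding_list {{1, 1}, {2, 2}} and a [4, 7] output:
+     one padded row above and below, two padded columns on each side.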
+ */ + switch (pad_rank) + { + case 0: + case 1: + { + const int32_t in_row_len = input_shape.Dims(0); + std::fill_n(output_data, padding_list[0].first, constant_value); + std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float)); + std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second, + constant_value); + break; + } + case 2: // HW + { + const int32_t in_row_len = input_shape.Dims(1); + const int32_t out_row_size = output_shape.Dims(1); + + // prepend padding rows + std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value); + + const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first; + for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j) + { + auto out_offset = i * out_row_size; + const auto in_offset = j * in_row_len; + + // prepend padding values + std::fill_n(output_data + out_offset, padding_list[1].first, constant_value); + + out_offset += padding_list[1].first; + + // copy a row of input data + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + + out_offset += in_row_len; + + // append padding values + std::fill_n(output_data + out_offset, padding_list[1].second, constant_value); + } + + // append padding rows + std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size, + constant_value); + break; + } + case 3: // HWC + { + const int32_t in_row_len = input_shape.Dims(2); + const int32_t out_row_size = output_shape.Dims(2); + const auto plain_size = out_row_size * output_shape.Dims(1); + + // prepend padding plains + std::fill_n(output_data, padding_list[0].first * plain_size, constant_value); + + const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first; + for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp) + { + const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2); + + // prepend padding rows + std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size, + constant_value); + + const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first; + for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp) + { + auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2); + const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2); + + // prepend padding values + std::fill_n(output_data + out_offset, padding_list[2].first, constant_value); + + out_offset += padding_list[2].first; + + // copy a row of input data + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + + out_offset += in_row_len; + + // append padding values + std::fill_n(output_data + out_offset, padding_list[2].second, constant_value); + } + + // append padding rows + std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size, + padding_list[1].second * out_row_size, constant_value); + } + + // append padding plains + std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size, + constant_value); + break; + } + case 4: + { + auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t { + return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3); + }; + const int32_t in_row_len = input_shape.Dims(3); + const int32_t out_row_size = output_shape.Dims(3); + const auto plain_size = out_row_size * output_shape.Dims(2); + const auto parallelepiped_size = plain_size * output_shape.Dims(1); + + // 
prepend padding parallelepipeds + std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value); + + const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first; + for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp) + { + const auto out_h_offset = get_offset(output_shape, i, 0, 0); + // prepend padding plains + std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value); + + const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first; + for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp) + { + const auto out_w_offset = get_offset(output_shape, i, j, 0); + + // prepend padding rows + std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size, + constant_value); + + const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first; + for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp) + { + auto out_c_offset = get_offset(output_shape, i, j, k); + const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp); + + // prepend padding values + std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value); + + out_c_offset += padding_list[3].first; + + // copy a row of input data + memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float)); + + out_c_offset += in_row_len; + + // append padding values + std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value); + } + + // append padding rows + std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size, + padding_list[2].second * out_row_size, constant_value); + } + + // append padding plains + std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size, + padding_list[1].second * plain_size, constant_value); + } + // append padding parallelepipeds + std::fill_n(output_data + r_b_inp_lim * parallelepiped_size, + padding_list[0].second * parallelepiped_size, constant_value); + break; + } + default: + throw std::runtime_error("Padding for rank > 4 NYI"); + break; + } +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_PAD_H__ diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h new file mode 100644 index 000000000..ea404a002 --- /dev/null +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_SOFTMAX_H__
+#define __NNFW_CKER_SOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+#include "cker/gemmlowp/FixedPoint.h"
+#include "cker/operation/optimized/SoftMax.h"
+#include "cker/operation/reference/SoftMax.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+#if defined(CKER_OPTIMIZED_EIGEN)
+  optimized::Softmax(params, input_shape, input_data, output_shape, output_data);
+#else  // defined(CKER_OPTIMIZED_EIGEN)
+  reference::Softmax(params, input_shape, input_data, output_shape, output_data);
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+}
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
+                    const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+  const int32_t input_beta_multiplier = params.input_multiplier;
+  const int32_t input_beta_left_shift = params.input_left_shift;
+  const int diff_min = params.diff_min;
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards. Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff = gemmlowp::FixedPoint<kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<0>;
+
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i)
+  {
+    uint8_t max_in_row = 0;
+    for (int c = 0; c < depth; ++c)
+    {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c)
+    {
+      int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min)
+      {
+        const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
+          input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+          FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                      exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int32_t fixed_sum_of_exps = sum_of_exps.raw();
+    int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
+    // This is the number of bits to the left of the binary point above 1.0.
+    // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
+    // no later adjustment will be needed.
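+    // As another worked case: with kAccumulationIntegerBits == 12, a sum of
+    // exps equal to 4.0 has raw value 4 * 2^19, so headroom_plus_one == 10,
+    // num_bits_over_unit == 2 below, and the shifted sum represents
+    // 4.0 / 2^2 == 1.0, whose reciprocal feeds the rescaling loop.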
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; + int32_t shifted_sum_minus_one = + static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32_t>(1) << 31)); + + FixedPoint0 shifted_scale = + one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); + + for (int c = 0; c < depth; ++c) + { + int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; + if (input_diff >= diff_min) + { + const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + + FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); + int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), + num_bits_over_unit + 31 - 8); + + output_data[i * depth + c] = static_cast<uint8_t>( + std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); + } + else + { + output_data[i * depth + c] = 0; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SOFTMAX_H__ diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h new file mode 100644 index 000000000..535fe86cf --- /dev/null +++ b/compute/cker/include/cker/operation/TransposeConv.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TRANSPOSE_CONV_H__ +#define __NNFW_CKER_TRANSPOSE_CONV_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +struct TransposeConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. 
+  float float_activation_min;
+  float float_activation_max;
+};
+
+inline void TransposeConv(const TransposeConvParams &params, const Shape &input_shape,
+                          const float *input_data, const Shape &filter_shape,
+                          const float *filter_data, const Shape &output_shape, float *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+
+  // Although transpose convolution simplifies to convolution with transposed
+  // weights for strides of 1, non-unitary striding complicates matters. To
+  // keep this reference implementation as clear as possible, we use a
+  // "scatter" access pattern, where we loop through all the input elements,
+  // computing their influence on the output, rather than looping through the
+  // output elements in the typical "gather" access pattern of a conv. We
+  // therefore must initialize the output array to zero.
+  const int num_elements = output_shape.FlatSize();
+  for (int i = 0; i < num_elements; i++)
+  {
+    output_data[i] = 0.0f;
+  }
+
+  // Loop through input elements one at a time.
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int in_y = 0; in_y < input_height; ++in_y)
+    {
+      for (int in_x = 0; in_x < input_width; ++in_x)
+      {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+        {
+          // Loop through the output elements it will influence
+          const int out_x_origin = (in_x * stride_width) - pad_width;
+          const int out_y_origin = (in_y * stride_height) - pad_height;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+              {
+                // Compute output element location
+                const int out_x = out_x_origin + filter_x;
+                const int out_y = out_y_origin + filter_y;
+                // We cannot accumulate out of bounds
+                if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
+                    (out_y < output_height))
+                {
+                  float input_value =
+                    input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+                  float filter_value =
+                    filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
+                  output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
+                    input_value * filter_value;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRANSPOSE_CONV_H__
diff --git a/compute/cker/include/cker/operation/optimized/AveragePool.h b/compute/cker/include/cker/operation/optimized/AveragePool.h
new file mode 100644
index 000000000..d94a5811a
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/AveragePool.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Change to apply neon for this function if it is faster
+inline void AveragePool(const PoolParams &params, const Shape &input_shape,
+                        const float *input_data, const Shape &output_shape, float *output_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+
+  // TODO(benoitjacob) make this a proper reference impl without Eigen!
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+  Eigen::VectorXf out_count(out_mat.cols());
+  out_count.setZero();
+  // Prefill the output to 0.
+  out_mat.setZero();
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int h = 0; h < input_height; ++h)
+    {
+      for (int w = 0; w < input_width; ++w)
+      {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        int hpad = h + params.padding_values.height;
+        int wpad = w + params.padding_values.width;
+        int h_start =
+          (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start =
+          (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise sum
+        for (int ph = h_start; ph < h_end; ++ph)
+        {
+          for (int pw = w_start; pw < w_end; ++pw)
+          {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+            out_count(out_offset)++;
+          }
+        }
+      }
+    }
+  }
+  // Divide the output by the actual number of elements being averaged over
+  assert(out_count.minCoeff() > 0);
+  out_mat.array().rowwise() /= out_count.transpose().array();
+
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+                                                  params.float_activation_max);
+  }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_AVERAGE_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/MaxPool.h b/compute/cker/include/cker/operation/optimized/MaxPool.h
new file mode 100644
index 000000000..07a14aee4
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/MaxPool.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+#define __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <Eigen/Core>
+#include <limits>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// TODO Change to apply neon for this function if it is faster
+inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // Prefill the output to minimum representable float value
+  out_mat.setConstant(std::numeric_limits<float>::lowest());
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int h = 0; h < input_height; ++h)
+    {
+      for (int w = 0; w < input_width; ++w)
+      {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
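+        // Note the inverted ("scatter") iteration: the loop visits every
+        // input pixel and folds it into each output window that covers it,
+        // so h_start/h_end below are solved for output coordinates rather
+        // than input ones.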
+        int hpad = h + params.padding_values.height;
+        int wpad = w + params.padding_values.width;
+        int h_start =
+          (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start =
+          (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+        for (int ph = h_start; ph < h_end; ++ph)
+        {
+          for (int pw = w_start; pw < w_end; ++pw)
+          {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) =
+              out_mat.col(out_offset)
+                .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
+          }
+        }
+      }
+    }
+  }
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+                                                  params.float_activation_max);
+  }
+}
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // defined(CKER_OPTIMIZED_EIGEN)
+
+#endif // __NNFW_CKER_OPTIMIZED_MAX_POOL_H__
diff --git a/compute/cker/include/cker/operation/optimized/SoftMax.h b/compute/cker/include/cker/operation/optimized/SoftMax.h
new file mode 100644
index 000000000..e44f251d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/SoftMax.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+#define __NNFW_CKER_OPTIMIZED_SOFTMAX_H__
+
+#if defined(CKER_OPTIMIZED_EIGEN)
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  // Validate that the shapes of input and output match.
+  MatchingFlatSize(input_shape, output_shape);
+
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // Compute the exponential first, removing the max coefficient for numerical
+  // stability.
+  out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
+  // We are separating out the exp function so that exp can be vectorized.
+  out_mat = out_mat.array().exp();
+  // Normalize to get the activations.
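+  // i.e. each column c (one softmax instance) becomes
+  // exp(beta * (in(r, c) - max_r(in(r, c)))) / sum_r(exp(...)), with the max
+  // subtraction and the division both applied per column.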
+ Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse(); + out_mat.array().rowwise() *= scale; +} + +} // namespace optimized +} // namespace cker +} // namespace nnfw + +#endif // defined(CKER_OPTIMIZED_EIGEN) + +#endif // __NNFW_CKER_OPTIMIZED_SOFTMAX_H__ diff --git a/compute/cker/include/cker/operation/reference/AveragePool.h b/compute/cker/include/cker/operation/reference/AveragePool.h new file mode 100644 index 000000000..3ddab4b24 --- /dev/null +++ b/compute/cker/include/cker/operation/reference/AveragePool.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__ +#define __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference +{ + +inline void AveragePool(const PoolParams &params, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array.
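// [Editorial aside, not part of the patch] A worked border case for the clamping
// below, with hypothetical values filter_width = 3, stride_width = 1 and a padding
// width of 1: out_x = 0 gives in_x_origin = -1, so filter_x_start = max(0, 1) = 1 and
// filter_x_end = min(3, input_width + 1) = 3, i.e. only filter taps 1 and 2 land
// inside the input. filter_count then counts only the valid taps, which keeps the
// border average unbiased instead of averaging in zero padding.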
+ const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + int filter_count = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start); + if (filter_count <= 0) + { + continue; + } + for (int channel = 0; channel < depth; ++channel) + { + float total = 0.f; + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + total += input_data[Offset(input_shape, batch, in_y, in_x, channel)]; + } + } + const float average = total / (float)filter_count; + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = + ActivationFunctionWithMinMax(average, params.float_activation_min, + params.float_activation_max); + } + } + } + } +} + +} // namespace reference +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_AVERAGE_POOL_H__ diff --git a/compute/cker/include/cker/operation/reference/MaxPool.h b/compute/cker/include/cker/operation/reference/MaxPool.h new file mode 100644 index 000000000..a0f0263c7 --- /dev/null +++ b/compute/cker/include/cker/operation/reference/MaxPool.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_REFERENCE_MAX_POOL_H__ +#define __NNFW_CKER_REFERENCE_MAX_POOL_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference +{ + +inline void MaxPool(const PoolParams &params, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int channel = 0; channel < depth; ++channel) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + float max = std::numeric_limits<float>::lowest(); + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + max = std::max(max, input_data[Offset(input_shape, batch, in_y, in_x, channel)]); + } + } + output_data[Offset(output_shape, batch, out_y, out_x, channel)] = + ActivationFunctionWithMinMax(max, params.float_activation_min, + params.float_activation_max); + } + } + } + } +} + +} // namespace reference +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_MAX_POOL_H__ diff --git a/compute/cker/include/cker/operation/reference/SoftMax.h b/compute/cker/include/cker/operation/reference/SoftMax.h new file mode 100644 index 000000000..420cb319b --- /dev/null +++ b/compute/cker/include/cker/operation/reference/SoftMax.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef __NNFW_CKER_REFERENCE_SOFTMAX_H__ +#define __NNFW_CKER_REFERENCE_SOFTMAX_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ +namespace reference +{ + +inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) + { + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + float max = std::numeric_limits<float>::lowest(); + for (int c = 0; c < depth; ++c) + { + max = std::max(max, input_data[i * depth + c]); + } + + // Compute sum. + float sum = 0.f; + for (int c = 0; c < depth; ++c) + { + sum += std::exp((input_data[i * depth + c] - max) * params.beta); + } + + // Compute result. + for (int c = 0; c < depth; ++c) + { + output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) * params.beta) / sum; + } + } +} + +} // namespace reference +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_SOFTMAX_H__ diff --git a/compute/ncnn/CMakeLists.txt b/compute/ncnn/CMakeLists.txt new file mode 100644 index 000000000..a8f50120f --- /dev/null +++ b/compute/ncnn/CMakeLists.txt @@ -0,0 +1,34 @@ +if(NOT BUILD_SRCN_KERNEL) + message(STATUS "SRCN kernel library build: disabled") + return() +else(NOT BUILD_SRCN_KERNEL) + message(STATUS "SRCN kernel library build: OK") +endif() + +# Find and use pre-installed OpenMP +find_package(OpenMP QUIET) +if(NOT OpenMP_FOUND) + return() +endif(NOT OpenMP_FOUND) + +file(GLOB_RECURSE SOURCES src/*.cc) +file(GLOB_RECURSE TESTS src/*_test.cc) +list(REMOVE_ITEM SOURCES ${TESTS}) + +add_library(nnfw_lib_srcn STATIC ${SOURCES}) +target_include_directories(nnfw_lib_srcn PUBLIC include) +if(NOT TARGET OpenMP::OpenMP_CXX) + find_package(Threads REQUIRED) + add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE) + set_property(TARGET OpenMP::OpenMP_CXX + PROPERTY INTERFACE_COMPILE_OPTIONS ${OpenMP_CXX_FLAGS}) + # Only works if the same flag is passed to the linker; use CMake 3.9+ otherwise (Intel, AppleClang) + set_property(TARGET OpenMP::OpenMP_CXX + PROPERTY INTERFACE_LINK_LIBRARIES ${OpenMP_CXX_FLAGS} Threads::Threads) + +endif() +target_link_libraries(nnfw_lib_srcn PRIVATE OpenMP::OpenMP_CXX) +target_link_libraries(nnfw_lib_srcn PRIVATE nnfw_common) +target_compile_definitions(nnfw_lib_srcn PRIVATE TIZEN) # ANDROID or TIZEN +#target_compile_definitions(nnfw_lib_srcn PRIVATE NCNN) # Enable if ready
set_target_properties(nnfw_lib_srcn PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/compute/ncnn/README.md b/compute/ncnn/README.md new file mode 100644 index 000000000..5c39d249a --- /dev/null +++ b/compute/ncnn/README.md @@ -0,0 +1,9 @@ +### NCNN compute library + +This compute library is based on the NCNN project (https://github.com/Tencent/ncnn) with custom optimizations. + +Current base commit: https://github.com/Tencent/ncnn/commit/0219f507b71bdb945d776c8586c162f2c22bba54 + +The files added for custom optimization are placed in: +- Headers: include/ncnn/srcn +- Sources: src/srcn diff --git a/compute/ncnn/include/ncnn/layer/binaryop.h b/compute/ncnn/include/ncnn/layer/binaryop.h new
file mode 100644 index 000000000..4ccfd94b4 --- /dev/null +++ b/compute/ncnn/include/ncnn/layer/binaryop.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef __NCNN_LAYER_BINARYOP_H__ +#define __NCNN_LAYER_BINARYOP_H__ + +#include "ncnn/mat.h" + +namespace nnfw +{ +namespace ncnn +{ + +enum class BinaryOp +{ + Operation_ADD = 0, + Operation_SUB = 1, + Operation_MUL = 2, + Operation_DIV = 3, + Operation_MAX = 4, + Operation_MIN = 5, + Operation_POW = 6, + Operation_SQUAREDDIFFERENCE = 7 +}; + +struct BinaryOpParam +{ + BinaryOp op_type; + float b; + + BinaryOpParam() : op_type{BinaryOp::Operation_ADD}, b{0.0f} {} +}; + +int ncnn_binary_op(const BinaryOpParam &param, const Mat &bottom_blob, const Mat &bottom_blob1, + Mat &top_blob); +// TODO Inplace function porting +// int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_top_blob) const; +// int ncnn_binary_op_inplace(const BinaryOpParam &param, std::vector<Mat> &bottom_top_blobs) const; + +} // namespace ncnn +} // namespace nnfw + +#endif // __NCNN_LAYER_BINARYOP_H__ diff --git a/compute/ncnn/include/ncnn/layer/instance_norm.h b/compute/ncnn/include/ncnn/layer/instance_norm.h new file mode 100644 index 000000000..b7d89281d --- /dev/null +++ b/compute/ncnn/include/ncnn/layer/instance_norm.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef __NNFW_LAYER_INSTANCE_NORM_H_ +#define __NNFW_LAYER_INSTANCE_NORM_H_ + +#include "ncnn/mat.h" +#ifdef __ARM_NEON +#include <arm_neon.h> +#endif // __ARM_NEON + +namespace nnfw +{ +namespace ncnn +{ + +void ncnn_instance_norm_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, + int channels, float eps); + +void ncnn_instance_norm_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, + int channels, float eps); + +void ncnn_instance_norm_with_relu_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, + int channels, float eps, float slope); + +void ncnn_instance_norm_with_relu_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, + int channels, float eps, float slope); + +} // namespace ncnn + +} // namespace nnfw + +#endif // __NNFW_LAYER_INSTANCE_NORM_H_ diff --git a/compute/ncnn/include/ncnn/mat.h b/compute/ncnn/include/ncnn/mat.h new file mode 100644 index 000000000..2a577939d --- /dev/null +++ b/compute/ncnn/include/ncnn/mat.h @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_NCNN_MAT_H__ +#define __NNFW_NCNN_MAT_H__ + +#include <stdlib.h> +#include <string.h> +#if __ARM_NEON +#include <arm_neon.h> +#endif + +namespace nnfw +{ +namespace ncnn +{ + +// the three dimension matrix +class Mat +{ +public: + // empty + Mat(); + // vec + Mat(int w, size_t elemsize = 4); + // image + Mat(int w, int h, size_t elemsize = 4); + // dim + Mat(int w, int h, int c, size_t elemsize = 4); + // copy + Mat(const Mat &m); + // external vec + Mat(int w, void *data, size_t elemsize = 4); + // external image + Mat(int w, int h, void *data, size_t elemsize = 4); + // external dim + Mat(int w, int h, int c, void *data, size_t elemsize = 4); + // release + ~Mat(); + // assign + Mat &operator=(const Mat &m); + // set all + void fill(float v); + template <typename T> void fill(T v); + // deep copy + Mat clone() const; + // reshape vec + Mat reshape(int w) const; + // reshape image + Mat reshape(int w, int h) const; + // reshape dim + Mat reshape(int w, int h, int c) const; + // allocate vec + void create(int w, size_t elemsize = 4); + // allocate image + void create(int w, int h, size_t elemsize = 4); +// allocate dim +#ifdef _MEMORY_TO_TIME_ + void create(int w, int h, int c, size_t elemsize = 4, bool isNew = false); +#else + void create(int w, int h, int c, size_t elemsize = 4); +#endif +#ifdef USE_OPENCL_INSIDE + void create_empity_mat(int _w, int _h, int _c, size_t _elemsize); +#endif + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // data reference + Mat channel(int c); + const Mat channel(int c) const; + float *row(int y); + const float *row(int y) const; + template <typename T> T *row(int y); + template <typename T> const T *row(int y) const; + + // access raw data + template <typename T> operator T *(); + template <typename T> operator const T *() const; + + // convenient access float vec element + float &operator[](int i); + const float &operator[](int i) const; + + enum + { + PIXEL_CONVERT_SHIFT = 16, + PIXEL_FORMAT_MASK = 0x0000ffff, + PIXEL_CONVERT_MASK = 0xffff0000, + + PIXEL_RGB = 1, + PIXEL_BGR = (1 << 1), + PIXEL_GRAY = (1 << 2), + PIXEL_RGBA = (1 << 3), + + PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + + PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + + PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + + PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT), + PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT), + PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT), + }; + +#ifdef _MEMORY_TO_TIME_ + static void from_pixels(const unsigned char *pixels, Mat &m, int type, int w, int h); + static void from_pixels(const unsigned char *pixels, Mat &m, int type, int w, int h, int top, + int bottom, int left, int right); +#endif // _MEMORY_TO_TIME_ + + // convenient construct from pixel data + static Mat from_pixels(const unsigned char *pixels, int type, int w, int h); + // convenient construct from pixel data and add the padding && only supports same PIXEL_RGB2BGR + // and PIXEL_BGR2RGB now + static Mat from_pixels(const unsigned char *pixels, int type, int w, int h, int top, int bottom, + int left, int right); + // convenient construct from pixel data and resize to specific size + static 
Mat from_pixels_resize(const unsigned char *pixels, int type, int w, int h, + int target_width, int target_height); + + // convenient export to pixel data + void to_pixels(unsigned char *pixels, int type); + // convenient export to pixel data and cut the padding && only supports same PIXEL_RGB2BGR and + // PIXEL_BGR2RGB now + void to_pixels(unsigned char *pixels, int type, int top, int bottom, int left, int right); + // convenient export to pixel data and resize to specific size + void to_pixels_resize(unsigned char *pixels, int type, int target_width, int target_height); + + // substract channel-wise mean values, then multiply by normalize values, pass 0 to skip + void substract_mean_normalize(const float *mean_vals, const float *norm_vals); + + // convenient construct from half precisoin floating point data + static Mat from_float16(const unsigned short *data, int size); + + // pointer to the data + void *data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int *refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // the dimensionality + int dims; + + int w; + int h; + int c; + + size_t cstep; +}; + +// misc function +// image pixel bilinear resize +void resize_bilinear_c1(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w, + int h); +void resize_bilinear_c3(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w, + int h); +void resize_bilinear_c4(const unsigned char *src, int srcw, int srch, unsigned char *dst, int w, + int h); + +// mat process +enum +{ + BORDER_CONSTANT = 0, + BORDER_REPLICATE = 1, +}; +void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type, + float v); +void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right); +void resize_bilinear(const Mat &src, Mat &dst, int w, int h); + +// the alignment of all the allocated buffers +#define MALLOC_ALIGN 16 + +// Aligns a pointer to the specified number of bytes +// ptr Aligned pointer +// n Alignment size that must be a power of two +template <typename _Tp> static inline _Tp *alignPtr(_Tp *ptr, int n = (int)sizeof(_Tp)) +{ + return (_Tp *)(((size_t)ptr + n - 1) & -n); +} + +// Aligns a buffer size to the specified number of bytes +// The function returns the minimum number that is greater or equal to sz and is divisible by n +// sz Buffer size to align +// n Alignment size that must be a power of two +static inline size_t alignSize(size_t sz, int n) { return (sz + n - 1) & -n; } + +static inline void *fastMalloc(size_t size) +{ + unsigned char *udata = (unsigned char *)malloc(size + sizeof(void *) + MALLOC_ALIGN); + if (!udata) + return 0; + unsigned char **adata = alignPtr((unsigned char **)udata + 1, MALLOC_ALIGN); + adata[-1] = udata; + return adata; +} + +static inline void fastFree(void *ptr) +{ + if (ptr) + { + unsigned char *udata = ((unsigned char **)ptr)[-1]; + free(udata); + } +} + +// exchange-add operation for atomic operations on reference counters +#if defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32) +// atomic increment on the linux version of the Intel(tm) compiler +#define NCNN_XADD(addr, delta) \ + (int)_InterlockedExchangeAdd(const_cast<void *>(reinterpret_cast<volatile void *>(addr)), delta) +#elif defined __GNUC__ +#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && \ + !defined __EMSCRIPTEN__ && !defined(__CUDACC__) +#ifdef 
__ATOMIC_ACQ_REL +#define NCNN_XADD(addr, delta) \ + __c11_atomic_fetch_add((_Atomic(int) *)(addr), delta, __ATOMIC_ACQ_REL) +#else +#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int) *)(addr), delta, 4) +#endif +#else +#if defined __ATOMIC_ACQ_REL && !defined __clang__ +// version for gcc >= 4.7 +#define NCNN_XADD(addr, delta) \ + (int)__atomic_fetch_add((unsigned *)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL) +#else +#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned *)(addr), (unsigned)(delta)) +#endif +#endif +#elif defined _MSC_VER && !defined RC_INVOKED +#include <intrin.h> +#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile *)addr, delta) +#else +// non-atomic fallback; must return the previous value like the variants above +static inline int NCNN_XADD(int *addr, int delta) +{ + int tmp = *addr; + *addr += delta; + return tmp; +} +#endif + +inline Mat::Mat() : data(0), refcount(0), elemsize(0), dims(0), w(0), h(0), c(0), cstep(0) {} + +inline Mat::Mat(int _w, size_t _elemsize) : data(0), refcount(0), dims(0) { create(_w, _elemsize); } + +inline Mat::Mat(int _w, int _h, size_t _elemsize) : data(0), refcount(0), dims(0) +{ + create(_w, _h, _elemsize); +} + +inline Mat::Mat(int _w, int _h, int _c, size_t _elemsize) : data(0), refcount(0), dims(0) +{ + create(_w, _h, _c, _elemsize); +} + +inline Mat::Mat(const Mat &m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), dims(m.dims) +{ + if (refcount) + NCNN_XADD(refcount, 1); + + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; +} + +inline Mat::Mat(int _w, void *_data, size_t _elemsize) + : data(_data), refcount(0), elemsize(_elemsize), dims(1) +{ + w = _w; + h = 1; + c = 1; + + cstep = w; +} + +inline Mat::Mat(int _w, int _h, void *_data, size_t _elemsize) + : data(_data), refcount(0), elemsize(_elemsize), dims(2) +{ + w = _w; + h = _h; + c = 1; + + cstep = w * h; +} + +inline Mat::Mat(int _w, int _h, int _c, void *_data, size_t _elemsize) + : data(_data), refcount(0), elemsize(_elemsize), dims(3) +{ + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline Mat::~Mat() { release(); } + +inline Mat &Mat::operator=(const Mat &m) +{ + if (this == &m) + return *this; + + if (m.refcount) + NCNN_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; + + return *this; +} + +inline void Mat::fill(float _v) +{ + int size = total(); + float *ptr = (float *)data; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON + float32x4_t _c = vdupq_n_f32(_v); +#if __aarch64__ + if (nn > 0) + { + asm volatile("0: \n" + "subs %w0, %w0, #1 \n" + "st1 {%4.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "w"(_c) // %4 + : "cc", "memory"); + } +#else + if (nn > 0) + { + asm volatile("0: \n" + "subs %0, #1 \n" + "vst1.f32 {%e4-%f4}, [%1 :128]!\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "w"(_c) // %4 + : "cc", "memory"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr++ = _v; + } +} + +template <typename T> inline void Mat::fill(T _v) +{ + int size = total(); + T *ptr = (T *)data; + for (int i = 0; i < size; i++) + { + ptr[i] = _v; + } +} + +inline Mat Mat::clone() const +{ + if (empty()) + return Mat(); + + Mat m; + if (dims == 1) + m.create(w, elemsize); + else if (dims == 2) + m.create(w, h,
elemsize); + else if (dims == 3) + m.create(w, h, c, elemsize); + + if (total() > 0) + { + memcpy(m.data, data, total() * elemsize); + } + + return m; +} + +inline Mat Mat::reshape(int _w) const +{ + if (w * h * c != _w) + return Mat(); + + if (dims == 3 && cstep != (size_t)w * h) + { + Mat m; + m.create(_w, elemsize); + + // flatten + for (int i = 0; i < c; i++) + { + const void *ptr = (unsigned char *)data + i * cstep * elemsize; + void *mptr = (unsigned char *)m.data + i * w * h * elemsize; + memcpy(mptr, ptr, w * h * elemsize); + } + + return m; + } + + Mat m = *this; + + m.dims = 1; + m.w = _w; + m.h = 1; + m.c = 1; + + m.cstep = _w; + + return m; +} + +inline Mat Mat::reshape(int _w, int _h) const +{ + if (w * h * c != _w * _h) + return Mat(); + + if (dims == 3 && cstep != (size_t)w * h) + { + Mat m; + m.create(_w, _h, elemsize); + + // flatten + for (int i = 0; i < c; i++) + { + const void *ptr = (unsigned char *)data + i * cstep * elemsize; + void *mptr = (unsigned char *)m.data + i * w * h * elemsize; + memcpy(mptr, ptr, w * h * elemsize); + } + + return m; + } + + Mat m = *this; + + m.dims = 2; + m.w = _w; + m.h = _h; + m.c = 1; + + m.cstep = _w * _h; + + return m; +} + +inline Mat Mat::reshape(int _w, int _h, int _c) const +{ + if (w * h * c != _w * _h * _c) + return Mat(); + + if (dims < 3) + { + if ((size_t)_w * _h != alignSize(_w * _h * elemsize, 16) / elemsize) + { + Mat m; + m.create(_w, _h, _c, elemsize); + + // align channel + for (int i = 0; i < _c; i++) + { + const void *ptr = (unsigned char *)data + i * _w * _h * elemsize; + void *mptr = (unsigned char *)m.data + i * m.cstep * m.elemsize; + memcpy(mptr, ptr, _w * _h * elemsize); + } + + return m; + } + } + else if (c != _c) + { + // flatten and then align + Mat tmp = reshape(_w * _h * _c); + return tmp.reshape(_w, _h, _c); + } + + Mat m = *this; + + m.dims = 3; + m.w = _w; + m.h = _h; + m.c = _c; + + m.cstep = alignSize(_w * _h * elemsize, 16) / elemsize; + + return m; +} + +inline void Mat::create(int _w, size_t _elemsize) +{ + if (dims == 1 && w == _w && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 1; + w = _w; + h = 1; + c = 1; + + cstep = w; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} + +inline void Mat::create(int _w, int _h, size_t _elemsize) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 2; + w = _w; + h = _h; + c = 1; + + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} + +#ifdef _MEMORY_TO_TIME_ +inline void Mat::create(int _w, int _h, int _c, size_t _elemsize, bool isNew) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize) + return; + + if (!isNew && dims == 3) + { + elemsize = _elemsize; + + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + return; + } + + release(); + + elemsize = _elemsize; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} + 
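// [Editorial aside, not part of the patch] Every allocating create() overload above
// and below co-allocates the reference counter at the tail of the payload: a single
// fastMalloc of totalsize + sizeof(int), with refcount pointing just past the data.
//
//   | payload: total() * elemsize bytes | int refcount |
//   ^ data                              ^ refcount = (int *)((unsigned char *)data + totalsize)
//
// The external-buffer constructors leave refcount NULL, so release() never frees
// memory the library did not allocate.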
+#else +inline void Mat::create(int _w, int _h, int _c, size_t _elemsize) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + + if (total() > 0) + { + size_t totalsize = total() * elemsize; + data = fastMalloc(totalsize + (int)sizeof(*refcount)); + refcount = (int *)(((unsigned char *)data) + totalsize); + *refcount = 1; + } +} +#endif //_MEMORY_TO_TIME_ + +#ifdef USE_OPENCL_INSIDE +inline void Mat::create_empity_mat(int _w, int _h, int _c, size_t _elemsize) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize) + return; + + release(); + + elemsize = _elemsize; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + data = NULL; +} +#endif // USE_OPENCL_INSIDE + +inline void Mat::addref() +{ + if (refcount) + NCNN_XADD(refcount, 1); +} + +inline void Mat::release() +{ + if (refcount && NCNN_XADD(refcount, -1) == 1) + fastFree(data); + + data = 0; + + elemsize = 0; + + dims = 0; + w = 0; + h = 0; + c = 0; + + cstep = 0; + + refcount = 0; +} + +inline bool Mat::empty() const { return data == 0 || total() == 0; } + +inline size_t Mat::total() const { return cstep * c; } + +inline Mat Mat::channel(int c) +{ + return Mat(w, h, (unsigned char *)data + cstep * c * elemsize, elemsize); +} + +inline const Mat Mat::channel(int c) const +{ + return Mat(w, h, (unsigned char *)data + cstep * c * elemsize, elemsize); +} + +inline float *Mat::row(int y) { return (float *)data + w * y; } + +inline const float *Mat::row(int y) const { return (const float *)data + w * y; } + +template <typename T> inline T *Mat::row(int y) { return (T *)data + w * y; } + +template <typename T> inline const T *Mat::row(int y) const { return (const T *)data + w * y; } + +template <typename T> inline Mat::operator T *() { return (T *)data; } + +template <typename T> inline Mat::operator const T *() const { return (const T *)data; } + +inline float &Mat::operator[](int i) { return ((float *)data)[i]; } + +inline const float &Mat::operator[](int i) const { return ((const float *)data)[i]; } + +} // namespace ncnn +} // namespace nnfw + +#endif // __NNFW_NCNN_MAT_H__ diff --git a/compute/ncnn/include/ncnn/srcn/conv_type.h b/compute/ncnn/include/ncnn/srcn/conv_type.h new file mode 100644 index 000000000..59152a094 --- /dev/null +++ b/compute/ncnn/include/ncnn/srcn/conv_type.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_SRCN_CONV_TYPE_H__ +#define __NNFW_SRCN_CONV_TYPE_H__ + +namespace nnfw +{ +namespace srcn +{ + +enum convType_t +{ + row_major = 0, + col_major +}; + +struct convMat_t +{ + int w; + int h; + int c; + int n; + float *data; +}; + +struct convParams_t +{ + int kernel_w; + int kernel_h; + int stride_w; + int stride_h; + int dilation_w; + int dilation_h; + int padding; + int pad_w; + int pad_h; +}; + +struct winogradParams_t +{ + int kernel_w; + int kernel_h; + int stride_w; + int stride_h; + int dilation_w; + int dilation_h; + int batch; + int w; + int h; + int inch; + int outch; + int num_threads; + convType_t conv_type; + float *weight_data; +}; + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_CONV_TYPE_H__ diff --git a/compute/ncnn/include/ncnn/srcn/srcn_conv.h b/compute/ncnn/include/ncnn/srcn/srcn_conv.h new file mode 100644 index 000000000..11130c0db --- /dev/null +++ b/compute/ncnn/include/ncnn/srcn/srcn_conv.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_SRCN_CONV_H__ +#define __NNFW_SRCN_CONV_H__ + +#include "conv_type.h" + +namespace nnfw +{ +namespace srcn +{ + +int check_winograd(winogradParams_t &params); + +float *trans_weight2winograd(winogradParams_t &params, unsigned int *size = NULL); + +void winograd_release(float *winograd_weight); + +void srcn_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, const float *winograd_weight, int num_threads, + convType_t conv_type); + +void srcn_deconvolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, int num_threads, convType_t conv_type); + +void *trans_weight2sparse(const convMat_t &weights_mat); + +void sparse_release(const int outch, void *ptr); + +void srcn_sparse_convolution2D(const convMat_t &in_mat, convMat_t &out_mat, + const convParams_t &in_param, const void *sparse_weight, + int num_threads, convType_t conv_type); + +void srcn_batch_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, + convMat_t &out_mat, const convParams_t &in_param, + const float *winograd_weight, int num_threads, convType_t conv_type); + +void srcn_convolution2D_gpu(const convMat_t &in_mat, const convMat_t &weights_mat, + convMat_t &out_mat, const convParams_t &in_param, convType_t conv_type); + +void srcn_convolution2D_dpu(const convMat_t &in_mat, const convMat_t &weights_mat, + convMat_t &out_mat, const convParams_t &in_param, convType_t conv_type); + +void srcn_depthwise_conv(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convMat_t &bias, const convParams_t &in_param, int num_threads, + convType_t conv_type); + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_CONV_H__ diff --git a/compute/ncnn/src/layer/arm/neon_mathfun.h b/compute/ncnn/src/layer/arm/neon_mathfun.h new file mode 100644 index
000000000..6e3cb66c8 --- /dev/null +++ b/compute/ncnn/src/layer/arm/neon_mathfun.h @@ -0,0 +1,315 @@ +/* NEON implementation of sin, cos, exp and log + * + * Inspired by Intel Approximate Math library, and based on the + * corresponding algorithms of the cephes math library + */ + +/* Copyright (C) 2011 Julien Pommier + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * (this is the zlib license) + */ + +#include <arm_neon.h> + +#define c_inv_mant_mask ~0x7f800000u +#define c_cephes_SQRTHF 0.707106781186547524 +#define c_cephes_log_p0 7.0376836292E-2 +#define c_cephes_log_p1 -1.1514610310E-1 +#define c_cephes_log_p2 1.1676998740E-1 +#define c_cephes_log_p3 -1.2420140846E-1 +#define c_cephes_log_p4 +1.4249322787E-1 +#define c_cephes_log_p5 -1.6668057665E-1 +#define c_cephes_log_p6 +2.0000714765E-1 +#define c_cephes_log_p7 -2.4999993993E-1 +#define c_cephes_log_p8 +3.3333331174E-1 +#define c_cephes_log_q1 -2.12194440e-4 +#define c_cephes_log_q2 0.693359375 + +/* natural logarithm computed for 4 simultaneous float + * return NaN for x <= 0 + */ +static inline float32x4_t log_ps(float32x4_t x) +{ + float32x4_t one = vdupq_n_f32(1); + + x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ + uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); + + int32x4_t ux = vreinterpretq_s32_f32(x); + + int32x4_t emm0 = vshrq_n_s32(ux, 23); + + /* keep only the fractional part */ + ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); + ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); + x = vreinterpretq_f32_s32(ux); + + emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); + float32x4_t e = vcvtq_f32_s32(emm0); + + e = vaddq_f32(e, one); + + /* part2: + * if( x < SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ + uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); + float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); + x = vsubq_f32(x, one); + e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); + x = vaddq_f32(x, tmp); + + float32x4_t z = vmulq_f32(x, x); + + float32x4_t y = vdupq_n_f32(c_cephes_log_p0); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); + y = vmulq_f32(y, x); + y = vaddq_f32(y, 
vdupq_n_f32(c_cephes_log_p8)); + y = vmulq_f32(y, x); + + y = vmulq_f32(y, z); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); + y = vaddq_f32(y, tmp); + + tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); + y = vsubq_f32(y, tmp); + + tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); + x = vaddq_f32(x, y); + x = vaddq_f32(x, tmp); + x = vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN + return x; +} + +#define c_exp_hi 88.3762626647949f +#define c_exp_lo -88.3762626647949f + +#define c_cephes_LOG2EF 1.44269504088896341 +#define c_cephes_exp_C1 0.693359375 +#define c_cephes_exp_C2 -2.12194440e-4 + +#define c_cephes_exp_p0 1.9875691500E-4 +#define c_cephes_exp_p1 1.3981999507E-3 +#define c_cephes_exp_p2 8.3334519073E-3 +#define c_cephes_exp_p3 4.1665795894E-2 +#define c_cephes_exp_p4 1.6666665459E-1 +#define c_cephes_exp_p5 5.0000001201E-1 + +/* exp() computed for 4 float at once */ +static inline float32x4_t exp_ps(float32x4_t x) +{ + float32x4_t tmp, fx; + + float32x4_t one = vdupq_n_f32(1); + x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); + x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + uint32x4_t mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); + float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, + c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5}; + float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0); + float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1); + float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2); + float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3); + float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4); + float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5); + + y = vmulq_f32(y, x); + z = vmulq_f32(x, x); + + y = vaddq_f32(y, c1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, c5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, one); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); + mm = vshlq_n_s32(mm, 23); + float32x4_t pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +#define c_minus_cephes_DP1 -0.78515625 +#define c_minus_cephes_DP2 -2.4187564849853515625e-4 +#define c_minus_cephes_DP3 -3.77489497744594108e-8 +#define c_sincof_p0 -1.9515295891E-4 +#define c_sincof_p1 8.3321608736E-3 +#define c_sincof_p2 -1.6666654611E-1 +#define c_coscof_p0 2.443315711809948E-005 +#define c_coscof_p1 -1.388731625493765E-003 +#define c_coscof_p2 4.166664568298827E-002 +#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI + +/* evaluation of 4 sines & cosines at once. + * + * The code is the exact rewriting of the cephes sinf function. + * Precision is excellent as long as x < 8192 (I did not bother to + * take into account the special handling they have for greater values + * -- it does not return garbage for arguments over 8192, though, but + * the extra precision is missing). 
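 * [Editorial aside, not part of the original comment] Typical usage of the wrappers
 * declared below (the input values are hypothetical):
 *
 *   float32x4_t x = vdupq_n_f32(0.5f);
 *   float32x4_t s, c;
 *   sincos_ps(x, &s, &c); // four sines and four cosines in one call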
+ * + * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + * surprising but correct result. + * + * Note also that when you compute sin(x), cos(x) is available at + * almost no extra price so both sin_ps and cos_ps make use of + * sincos_ps.. + */ +static inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos) +{ + // any x + float32x4_t xmm1, xmm2, xmm3, y; + + uint32x4_t emm2; + + uint32x4_t sign_mask_sin, sign_mask_cos; + sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); + x = vabsq_f32(x); + + /* scale by 4/Pi */ + y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); + + /* store the integer part of y in mm0 */ + emm2 = vcvtq_u32_f32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); + emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); + y = vcvtq_f32_u32(emm2); + + /* get the polynom selection mask + * there is one polynom for 0 <= x <= Pi/4 + * and another one for Pi/4<x<=Pi/2 + * + * Both branches will be computed. + */ + uint32x4_t poly_mask = vtstq_u32(emm2, vdupq_n_u32(2)); + + /* The magic pass: "Extended precision modular arithmetic" + * x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1); + xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2); + xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3); + x = vaddq_f32(x, xmm1); + x = vaddq_f32(x, xmm2); + x = vaddq_f32(x, xmm3); + + sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4))); + sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4)); + + /* Evaluate the first polynom (0 <= x <= Pi/4) in y1, + * and the second polynom (Pi/4 <= x <= 0) in y2 */ + float32x4_t z = vmulq_f32(x, x); + float32x4_t y1, y2; + + y1 = vmulq_n_f32(z, c_coscof_p0); + y2 = vmulq_n_f32(z, c_sincof_p0); + y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1)); + y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1)); + y1 = vmulq_f32(y1, z); + y2 = vmulq_f32(y2, z); + y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2)); + y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2)); + y1 = vmulq_f32(y1, z); + y2 = vmulq_f32(y2, z); + y1 = vmulq_f32(y1, z); + y2 = vmulq_f32(y2, x); + y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f))); + y2 = vaddq_f32(y2, x); + y1 = vaddq_f32(y1, vdupq_n_f32(1)); + + /* select the correct result from the two polynoms */ + float32x4_t ys = vbslq_f32(poly_mask, y1, y2); + float32x4_t yc = vbslq_f32(poly_mask, y2, y1); + *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys); + *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc)); +} + +static inline float32x4_t sin_ps(float32x4_t x) +{ + float32x4_t ysin, ycos; + sincos_ps(x, &ysin, &ycos); + return ysin; +} + +static inline float32x4_t cos_ps(float32x4_t x) +{ + float32x4_t ysin, ycos; + sincos_ps(x, &ysin, &ycos); + return ycos; +} + +static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) +{ + float32x4_t reciprocal = vrecpeq_f32(b); + reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + // reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); + return vmulq_f32(a, reciprocal); +} + +static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) +{ + // pow(x, m) = exp(m * log(x)) + return exp_ps(vmulq_f32(b, log_ps(a))); +} diff --git a/compute/ncnn/src/layer/binaryop.cc b/compute/ncnn/src/layer/binaryop.cc new file mode 100644 index 000000000..a09d55f78 --- /dev/null +++ b/compute/ncnn/src/layer/binaryop.cc @@ -0,0 +1,1640 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ncnn/layer/binaryop.h" +#include <math.h> +#include <algorithm> +#include <functional> +#include <sys/time.h> + +#if __ARM_NEON +#include <arm_neon.h> +#include "arm/neon_mathfun.h" +#endif // __ARM_NEON + +namespace nnfw +{ +namespace ncnn +{ + +template <typename Op> static int binary_op(const Mat &a, const Mat &b, Mat &c) +{ + Op op; + + int w = a.w; + int h = a.h; + int channels = a.c; + int size = w * h; + + int w1 = b.w; + int h1 = b.h; + int channels1 = b.c; + int size1 = w1 * h1; + + if (a.dims == 3) + { + c.create(w, h, channels); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + if (b.w == 1 && b.h == 1) + { + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = a.channel(q); + const float *ptr1 = b.channel(q); + float *outptr = c.channel(q); + + float tt = *ptr1; + for (int i = 0; i < size; i++) + { + outptr[i] = op(ptr[i], tt); + } + } + + return 0; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = a.channel(q); + const float *ptr1 = b.channel(q); + float *outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = op(ptr[i], ptr1[i]); + } + } + + return 0; + } + + if (b.dims == 2) + { +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = a.channel(q); + const float *ptr1 = (const float *)b + h * q; + float *outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + const float b0 = ptr1[y]; + for (int x = 0; x < w; x++) + { + outptr[x] = op(ptr[x], b0); + } + + ptr += w; + outptr += w; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1) + { + const float b0 = b[0]; +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = a.channel(q); + float *outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = op(ptr[i], b0); + } + } + + return 0; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = a.channel(q); + const float b0 = b[q]; + float *outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = op(ptr[i], b0); + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 3) + { + c.create(w1, h1, 
channels1); + if (c.empty()) + return -100; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float *ptr = (const float *)a + h1 * q; + const float *ptr1 = b.channel(q); + float *outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + const float a0 = ptr[y]; + for (int x = 0; x < w1; x++) + { + outptr[x] = op(a0, ptr1[x]); + } + + ptr1 += w1; + outptr += w1; + } + } + + return 0; + } + + c.create(w, h); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + for (int i = 0; i < size; i++) + { + c[i] = op(a[i], b[i]); + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, h); + if (c.empty()) + return -100; + + if (b.w == 1) + { + const float b0 = b[0]; + for (int i = 0; i < size; i++) + { + c[i] = op(a[i], b0); + } + + return 0; + } + + const float *ptr = a; + float *outptr = c; + + for (int y = 0; y < h; y++) + { + const float b0 = b[y]; + for (int x = 0; x < w; x++) + { + outptr[x] = op(ptr[x], b0); + } + + ptr += w; + outptr += w; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1) + { + if (b.dims == 3) + { + c.create(w1, h1, channels1); + if (c.empty()) + return -100; + + const float a0 = a[0]; +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float *ptr1 = b.channel(q); + float *outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + outptr[i] = op(a0, ptr1[i]); + } + } + + return 0; + } + + if (b.dims == 2) + { + c.create(w1, h1); + if (c.empty()) + return -100; + + const float a0 = a[0]; + for (int i = 0; i < size1; i++) + { + c[i] = op(a0, b[i]); + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w1); + if (c.empty()) + return -100; + + const float a0 = a[0]; + for (int i = 0; i < size1; i++) + { + c[i] = op(a0, b[i]); + } + + return 0; + } + } + + if (b.dims == 3) + { + c.create(w1, h1, channels1); + if (c.empty()) + return -100; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float a0 = a[q]; + const float *ptr1 = b.channel(q); + float *outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + outptr[i] = op(a0, ptr1[i]); + } + } + + return 0; + } + + if (b.dims == 2) + { + c.create(w1, h1); + if (c.empty()) + return -100; + + const float *ptr1 = b; + float *outptr = c; + + for (int y = 0; y < h1; y++) + { + const float a0 = a[y]; + for (int x = 0; x < w1; x++) + { + outptr[x] = op(a0, ptr1[x]); + } + + ptr1 += w1; + outptr += w1; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w); + if (c.empty()) + return -100; + + if (b.w == 1) + { + const float b0 = b[0]; + for (int i = 0; i < size; i++) + { + c[i] = op(a[i], b0); + } + + return 0; + } + + for (int i = 0; i < size; i++) + { + c[i] = op(a[i], b[i]); + } + } + } + + return 0; +} + +template <typename Op> static int binary_op_scalar_inplace(Mat &a, float b) +{ + Op op; + + int w = a.w; + int h = a.h; + int channels = a.c; + int size = w * h; + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + float *ptr = a.channel(q); + + for (int i = 0; i < size; i++) + { + ptr[i] = op(ptr[i], b); + } + } + + return 0; +} + +template <typename T> struct binary_op_max : std::binary_function<T, T, T> +{ + T operator()(const T &x, const T &y) const { return std::max(x, y); } +}; + +template <typename T> struct binary_op_min : std::binary_function<T, T, T> +{ + T operator()(const T &x, const T &y) const { return std::min(x, y); } +}; + +template <typename T> struct binary_op_pow : std::binary_function<T, T, T> +{ + T operator()(const T &x, const T &y) const { return pow(x, y); } 
+}; + +template <typename T> struct binary_op_SquaredDifference : std::binary_function<T, T, T> +{ + T operator()(const T &x, const T &y) const { return pow((x - y), 2); } +}; + +int ncnn_binary_op(const BinaryOpParam &param, const Mat &bottom_blob, const Mat &bottom_blob1, + Mat &top_blob) +{ + int ret = 0; + auto op_type = param.op_type; + // auto b = param.b; + + // Only the ADD operation without broadcasting is supported + // For other cases, the internal memory allocation needs to be removed and correctness checked + if (op_type != BinaryOp::Operation_ADD) + { + throw std::runtime_error{"NYI: Only support ADD operation"}; + } + if (bottom_blob.dims != bottom_blob1.dims) + { + throw std::runtime_error{"NYI: Cannot use broadcasting"}; + } + +// printf("-------------------BinaryOp---------------\n"); + +// printf("op_type = %d, ", op_type); +// printf("in1: (%d, %d, %d), dims = %d, ", bottom_blob.w, bottom_blob.h, bottom_blob.c, +// bottom_blob.dims); +// printf("in2: (%d, %d, %d), dims = %d\n", bottom_blob1.w, bottom_blob1.h, bottom_blob1.c, +// bottom_blob1.dims); + +#if __ARM_NEON + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int w1 = bottom_blob1.w; + int h1 = bottom_blob1.h; + int channels1 = bottom_blob1.c; + int size1 = w1 * h1; + + if (op_type == BinaryOp::Operation_ADD) + { + if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) + { + // Fix for nnfw: disable allocation for output + // top_blob.create(w, h, channels); + if (bottom_blob1.w == 1 && bottom_blob1.h == 1) + { + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + float tt = *ptr1; + + float32x4_t _p2 = vdupq_n_f32(tt); + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + + _p1 = vaddq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 + tt); + in1++; + out++; + } + +#else + float tt = *ptr1; + for (int i = 0; i < size; i++) + { + outptr[i] = (ptr[i] + tt); + } +#endif + } + + ret = 0; + } + else + { + if (size * bottom_blob.elemsize % 16 != 0) + { + throw std::runtime_error{"Unmatched alignment"}; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *in2 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vld1q_f32(in2); + + _p1 = vaddq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + in2 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = *in1 + *in2; + in1++; + in2++; + out++; + } + } + } + } + else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) + { + top_blob.create(w, h, channels); + if (bottom_blob1.w == 1) + { + ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob); + // return ret; + goto out; + } + float *pt = (float *)bottom_blob1.data; + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float b0 = pt[q]; + float *outptr =
top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vdupq_n_f32(b0); + + _p1 = vaddq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 + b0); + in1++; + out++; + } + } + } + else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) + { + top_blob.create(w1, h1, channels1); + if (top_blob.empty()) + return -100; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float a0 = bottom_blob[q]; + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size1 >> 2; + int remain = size1 - (nn << 2); + + float *in1 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vdupq_n_f32(a0); + float32x4_t _p2 = vld1q_f32(in1); + + _p1 = vaddq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (a0 + *in1); + in1++; + out++; + } + } + } + else + ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob); + } + +#if 0 // Disable operation except Operation_ADD + + if (op_type == BinaryOp::Operation_SUB) + { + if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) + { + top_blob.create(w, h, channels); + + if (bottom_blob1.w == 1 && bottom_blob1.h == 1) + { + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + float tt = *ptr1; + + float32x4_t _p2 = vdupq_n_f32(tt); + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + + _p1 = vsubq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 - tt); + in1++; + out++; + } + +#else + float tt = *ptr1; + for (int i = 0; i < size; i++) + { + outptr[i] = (ptr[i] - tt); + } +#endif + } + + ret = 0; + } + else + { + top_blob.create(w, h, channels); +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *in2 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vld1q_f32(in2); + + _p1 = vsubq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + in2 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = *in1 - *in2; + in1++; + in2++; + out++; + } + } + } + } + else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) + { + top_blob.create(w, h, channels); + if (bottom_blob1.w == 1) + { + ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob); + // return ret; + goto out; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float b0 = bottom_blob1[q]; + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float 
*out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vdupq_n_f32(b0); + + _p1 = vsubq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 - b0); + in1++; + out++; + } + } + } + else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) + { + top_blob.create(w1, h1, channels1); + if (top_blob.empty()) + return -100; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float a0 = bottom_blob[q]; + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size1 >> 2; + int remain = size1 - (nn << 2); + + float *in1 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vdupq_n_f32(a0); + float32x4_t _p2 = vld1q_f32(in1); + + _p1 = vsubq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (a0 - *in1); + in1++; + out++; + } + } + } + else + ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob); + } + + if (op_type == BinaryOp::Operation_MUL) + { + if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) + { + top_blob.create(w, h, channels); + + if (bottom_blob1.w == 1 && bottom_blob1.h == 1) + { + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + float tt = *ptr1; + + float32x4_t _p2 = vdupq_n_f32(tt); + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + + _p1 = vmulq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 * tt); + in1++; + out++; + } + +#else + float tt = *ptr1; + for (int i = 0; i < size; i++) + { + outptr[i] = (ptr[i] * tt); + } +#endif + } + + ret = 0; + } + else + { +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *in2 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vld1q_f32(in2); + + _p1 = vmulq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + in2 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = *in1 * *in2; + in1++; + in2++; + out++; + } + } + } + } + else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) + { + top_blob.create(w, h, channels); + if (bottom_blob1.w == 1) + { + ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob); + // return ret; + goto out; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float b0 = bottom_blob1[q]; + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vdupq_n_f32(b0); + + _p1 = vmulq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 
4; + } + for (; remain > 0; remain--) + { + *out = (*in1 * b0); + in1++; + out++; + } + } + } + else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) + { + top_blob.create(w1, h1, channels1); + if (top_blob.empty()) + return -100; + + if (bottom_blob.w != bottom_blob1.c) + { + ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob); + goto out; + } + + float *pt = (float *)bottom_blob.data; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float a0 = pt[q]; + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size1 >> 2; + int remain = size1 - (nn << 2); + + float *in1 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vdupq_n_f32(a0); + float32x4_t _p2 = vld1q_f32(in1); + + _p1 = vmulq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (a0 * *in1); + in1++; + out++; + } + } + } + else + ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob); + } + + if (op_type == BinaryOp::Operation_DIV) + { + if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) + { + top_blob.create(w, h, channels); + if (bottom_blob1.w == 1 && bottom_blob1.h == 1) + { + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + float tt = *ptr1; + + float32x4_t _p2 = vdupq_n_f32(tt); + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + + float32x4_t _p3 = vrecpeq_f32(_p2); + _p3 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); + _p1 = vmulq_f32(_p1, _p3); + + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 / tt); + in1++; + out++; + } + +#else + float tt = *ptr1; + for (int i = 0; i < size; i++) + { + outptr[i] = (ptr[i] / tt); + } +#endif + } + + // return 0; + goto out; + } + else + { +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *in2 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vld1q_f32(in2); + + float32x4_t _p3 = vrecpeq_f32(_p2); + _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); + _p1 = vmulq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + in2 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = *in1 / *in2; + in1++; + in2++; + out++; + } + } + } + } + else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) + { + top_blob.create(w, h, channels); + if (bottom_blob1.w == 1) + { + ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob); + // return ret; + goto out; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float b0 = bottom_blob1[q]; + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + 
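+        // NEON has no vector divide here; the loop body approximates 1/b0 with
+        // vrecpeq_f32 (a rough estimate) refined by one Newton-Raphson step via
+        // vrecpsq_f32, i.e. r' = r * (2 - b0 * r), then multiplies by the
+        // numerator.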
{ + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vdupq_n_f32(b0); + + //_p1 = vsubq_f32(_p1, _p2); + float32x4_t _p3 = vrecpeq_f32(_p2); + _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); + _p1 = vmulq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 / b0); + in1++; + out++; + } + } + } + else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) + { + top_blob.create(w1, h1, channels1); + if (top_blob.empty()) + return -100; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float a0 = bottom_blob[q]; + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size1 >> 2; + int remain = size1 - (nn << 2); + + float *in1 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vdupq_n_f32(a0); + float32x4_t _p2 = vld1q_f32(in1); + + //_p1 = vsubq_f32(_p1, _p2); + float32x4_t _p3 = vrecpeq_f32(_p2); + _p2 = vmulq_f32(vrecpsq_f32(_p2, _p3), _p3); + _p1 = vmulq_f32(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (a0 / *in1); + in1++; + out++; + } + } + } + else + ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob); + } + + if (op_type == BinaryOp::Operation_MAX) + ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob); + + if (op_type == BinaryOp::Operation_MIN) + ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob); + + if (op_type == BinaryOp::Operation_POW) + { + if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) + { + top_blob.create(w, h, channels); +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *in2 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vld1q_f32(in2); + + _p1 = pow_ps(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + in2 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = pow(*in1, *in2); + in1++; + in2++; + out++; + } + } + } + else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) + { + top_blob.create(w, h, channels); + if (bottom_blob1.w == 1) + { + ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob); + // return ret; + goto out; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float b0 = bottom_blob1[q]; + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vdupq_n_f32(b0); + + _p1 = pow_ps(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = pow(*in1, b0); + in1++; + out++; + } + } + } + else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) + { + top_blob.create(w1, h1, channels1); + if (top_blob.empty()) + return -100; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float a0 = bottom_blob[q]; + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + 
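+        // Vectorization idiom used throughout this file: run size/4 full
+        // four-lane float32x4_t iterations, then finish the size%4 tail with
+        // scalar code. In sketch form (reference only):
+        //   int nn = size >> 2;            // full 4-float vector iterations
+        //   int remain = size - (nn << 2); // 0..3 leftover scalar elements
+        //   for (; nn > 0; nn--) { /* vld1q_f32 / op / vst1q_f32, advance 4 */ }
+        //   for (; remain > 0; remain--) { /* same op on one float, advance 1 */ }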
+ int nn = size1 >> 2; + int remain = size1 - (nn << 2); + + float *in1 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vdupq_n_f32(a0); + float32x4_t _p2 = vld1q_f32(in1); + + _p1 = pow_ps(_p1, _p2); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = pow(a0, *in1); + in1++; + out++; + } + } + } + else + ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob); + } + + if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE) + { + if (bottom_blob.dims == 3 && bottom_blob1.dims == 3) + { + top_blob.create(w, h, channels); + + if (bottom_blob1.w == 1 && bottom_blob1.h == 1) + { + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + float tt = *ptr1; + + float32x4_t _p2 = vdupq_n_f32(tt); + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + + _p1 = vsubq_f32(_p1, _p2); + _p1 = vmulq_f32(_p1, _p1); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + float t2 = *in1 - tt; + *out = t2 * t2; + in1++; + out++; + } + +#else + float tt = *ptr1; + for (int i = 0; i < size; i++) + { + float t2 = (ptr[i] - tt); + outptr[i] = t2 * t2; + } +#endif + } + + ret = 0; + } + else + { +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *in2 = const_cast<float *>(ptr1); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vld1q_f32(in2); + + _p1 = vsubq_f32(_p1, _p2); + _p1 = vmulq_f32(_p1, _p1); + vst1q_f32(out, _p1); + in1 += 4; + in2 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 - *in2) * (*in1 - *in2); + in1++; + in2++; + out++; + } + } + } + } + else if (bottom_blob.dims == 3 && bottom_blob1.dims == 1) + { + top_blob.create(w, h, channels); + if (bottom_blob1.w == 1) + { + ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob); + // return ret; + goto out; + } + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float *ptr = bottom_blob.channel(q); + const float b0 = bottom_blob1[q]; + float *outptr = top_blob.channel(q); + + int nn = size >> 2; + int remain = size - (nn << 2); + + float *in1 = const_cast<float *>(ptr); + float *out = const_cast<float *>(outptr); + + for (; nn > 0; nn--) + { + float32x4_t _p1 = vld1q_f32(in1); + float32x4_t _p2 = vdupq_n_f32(b0); + + _p1 = vsubq_f32(_p1, _p2); + _p1 = vmulq_f32(_p1, _p1); + vst1q_f32(out, _p1); + in1 += 4; + out += 4; + } + for (; remain > 0; remain--) + { + *out = (*in1 - b0) * (*in1 - b0); + in1++; + out++; + } + } + } + else if (bottom_blob.dims == 1 && bottom_blob1.dims == 3) + { + top_blob.create(w1, h1, channels1); + if (top_blob.empty()) + return -100; + +#pragma omp parallel for + for (int q = 0; q < channels1; q++) + { + const float a0 = bottom_blob[q]; + const float *ptr1 = bottom_blob1.channel(q); + float *outptr = top_blob.channel(q); + + int nn = size1 >> 2; 
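+        // Squared difference below is vectorized as d = vsubq_f32(_p1, _p2)
+        // followed by vmulq_f32(d, d), i.e. (a0 - x)^2 per lane; the scalar
+        // tail computes the identical expression element by element.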
+        int remain = size1 - (nn << 2);
+
+        float *in1 = const_cast<float *>(ptr1);
+        float *out = const_cast<float *>(outptr);
+
+        for (; nn > 0; nn--)
+        {
+          float32x4_t _p1 = vdupq_n_f32(a0);
+          float32x4_t _p2 = vld1q_f32(in1);
+
+          _p1 = vsubq_f32(_p1, _p2);
+          _p1 = vmulq_f32(_p1, _p1);
+          vst1q_f32(out, _p1);
+          in1 += 4;
+          out += 4;
+        }
+        for (; remain > 0; remain--)
+        {
+          *out = (a0 - *in1) * (a0 - *in1);
+          in1++;
+          out++;
+        }
+      }
+    }
+    else
+      ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
+  }
+
+#endif // 0 (Disable operation except Operation_ADD)
+
+#else
+
+  if (op_type == BinaryOp::Operation_ADD)
+    ret = binary_op<std::plus<float>>(bottom_blob, bottom_blob1, top_blob);
+
+  if (op_type == BinaryOp::Operation_SUB)
+    ret = binary_op<std::minus<float>>(bottom_blob, bottom_blob1, top_blob);
+
+  if (op_type == BinaryOp::Operation_MUL)
+    ret = binary_op<std::multiplies<float>>(bottom_blob, bottom_blob1, top_blob);
+
+  if (op_type == BinaryOp::Operation_DIV)
+    ret = binary_op<std::divides<float>>(bottom_blob, bottom_blob1, top_blob);
+
+  if (op_type == BinaryOp::Operation_MAX)
+    ret = binary_op<binary_op_max<float>>(bottom_blob, bottom_blob1, top_blob);
+
+  if (op_type == BinaryOp::Operation_MIN)
+    ret = binary_op<binary_op_min<float>>(bottom_blob, bottom_blob1, top_blob);
+
+  if (op_type == BinaryOp::Operation_POW)
+    ret = binary_op<binary_op_pow<float>>(bottom_blob, bottom_blob1, top_blob);
+
+  if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
+    ret = binary_op<binary_op_SquaredDifference<float>>(bottom_blob, bottom_blob1, top_blob);
+#endif
+
+out:
+  return ret;
+}
+
+int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_top_blob)
+{
+  auto op_type = param.op_type;
+  auto b = param.b;
+
+  if (op_type == BinaryOp::Operation_ADD)
+    return binary_op_scalar_inplace<std::plus<float>>(bottom_top_blob, b);
+
+  if (op_type == BinaryOp::Operation_SUB)
+    return binary_op_scalar_inplace<std::minus<float>>(bottom_top_blob, b);
+
+  if (op_type == BinaryOp::Operation_MUL)
+    return binary_op_scalar_inplace<std::multiplies<float>>(bottom_top_blob, b);
+
+  if (op_type == BinaryOp::Operation_DIV)
+    return binary_op_scalar_inplace<std::divides<float>>(bottom_top_blob, b);
+
+  if (op_type == BinaryOp::Operation_MAX)
+    return binary_op_scalar_inplace<binary_op_max<float>>(bottom_top_blob, b);
+
+  if (op_type == BinaryOp::Operation_MIN)
+    return binary_op_scalar_inplace<binary_op_min<float>>(bottom_top_blob, b);
+
+  if (op_type == BinaryOp::Operation_POW)
+    return binary_op_scalar_inplace<binary_op_pow<float>>(bottom_top_blob, b);
+
+  if (op_type == BinaryOp::Operation_SQUAREDDIFFERENCE)
+    return binary_op_scalar_inplace<binary_op_SquaredDifference<float>>(bottom_top_blob, b);
+
+  return 0;
+}
+
+int ncnn_binary_op_inplace(const BinaryOpParam &param, Mat &bottom_blob, Mat &bottom_top_blob)
+{
+  int ret = 0;
+
+  Mat &bottom_blob1 = bottom_top_blob;
+  Mat &top_blob = bottom_top_blob;
+  auto op_type = param.op_type;
+
+  if (op_type == BinaryOp::Operation_ADD)
+  {
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    int size = w * h;
+
+// Unused variables
+// int w1 = bottom_blob1.w;
+// int h1 = bottom_blob1.h;
+// int channels1 = bottom_blob1.c;
+// int size1 = w1 * h1;
+
+#if __ARM_NEON
+
+    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+    {
+#pragma omp parallel for
+      for (int q = 0; q < channels; q++)
+      {
+        float *ptr = bottom_blob.channel(q);
+        float *ptr1 = bottom_blob1.channel(q);
+        float *outptr = top_blob.channel(q);
+
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+
+        float *in1 = const_cast<float *>(ptr);
+        float *in2 = const_cast<float *>(ptr1);
+        float *out = const_cast<float *>(outptr);
+
+        for (; nn > 0; nn--)
+        {
+          float32x4_t _p1 = vld1q_f32(in1);
+          float32x4_t _p2 = vld1q_f32(in2);
+
+          _p1 = vaddq_f32(_p1, _p2);
+          vst1q_f32(out, _p1);
+          in1 += 4;
+          in2 += 4;
+          out += 4;
+        }
+        for (; remain > 0; remain--)
+        {
+          *out = *in1 + *in2;
+          in1++;
+          in2++;
+          out++;
+        }
+      }
+    }
+#else
+    if (bottom_blob.dims == 3 && bottom_blob1.dims == 3)
+    {
+#pragma omp parallel for
+      for (int q = 0; q < channels; q++)
+      {
+        float *ptr = bottom_blob.channel(q);
+        float *ptr1 = bottom_blob1.channel(q);
+        float *outptr = top_blob.channel(q);
+
+        for (int i = 0; i < size; i++)
+        {
+          outptr[i] = ptr[i] + ptr1[i];
+        }
+      }
+      return 0;
+    }
+#endif
+  }
+  else
+  {
+    return -1;
+  }
+  return ret;
+}
+
+} // namespace ncnn
+} // namespace nnfw
diff --git a/compute/ncnn/src/layer/instance_norm.cc b/compute/ncnn/src/layer/instance_norm.cc
new file mode 100644
index 000000000..08c3f2c23
--- /dev/null
+++ b/compute/ncnn/src/layer/instance_norm.cc
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
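+
+// Editorial note: every kernel below evaluates the same per-channel transform
+//   y = (x - mean) / sqrt(var + eps) * gamma + beta
+// folded into a single multiply-add y = a * x + b. A minimal scalar sketch of
+// what the vectorized paths compute (reference only; the helper name is
+// hypothetical, not part of this library):
+//
+//   static void instance_norm_ref(const float *x, float *y, int size,
+//                                 float gamma, float beta, float eps)
+//   {
+//     float sum = 0.f, sqsum = 0.f;
+//     for (int i = 0; i < size; i++)
+//     {
+//       sum += x[i];
+//       sqsum += x[i] * x[i];
+//     }
+//     const float mean = sum / size;
+//     const float var = sqsum / size - mean * mean; // E[x^2] - E[x]^2
+//     const float a = gamma / sqrtf(var + eps);
+//     const float b = beta - mean * a;
+//     for (int i = 0; i < size; i++)
+//       y[i] = a * x[i] + b;
+//   }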
+ +#include "ncnn/layer/instance_norm.h" +#ifdef _OPENMP +#include <omp.h> +#endif + +#include <math.h> +#include "ncnn/mat.h" +#ifdef __ARM_NEON +#include <arm_neon.h> +#endif // __ARM_NEON + +namespace nnfw +{ +namespace ncnn +{ + +void ncnn_instance_norm_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, + int channels, float eps) +{ + // x = (x - mean) / (sqrt(var) + eps) * gamma + beta + + int w = in_mat.w; + int h = in_mat.h; + int size = w * h; +#ifdef __ARM_NEON + int nn = size >> 2; + int left4 = size & 3; +#endif + +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { +#ifdef __ARM_NEON + float *in_ptr = in_mat.channel(q); + float *out_ptr = out_mat.channel(q); + float32x4_t _sum = vdupq_n_f32(0.f); + float32x4_t _sq_sum = vdupq_n_f32(0.f); + for (int n = nn; n > 0; n--) + { + float32x4_t _p = vld1q_f32(in_ptr); + _sum = vaddq_f32(_sum, _p); + _p = vmulq_f32(_p, _p); + _sq_sum = vaddq_f32(_sq_sum, _p); + in_ptr += 4; + } + float sum = vgetq_lane_f32(_sum, 0) + vgetq_lane_f32(_sum, 1); + sum += vgetq_lane_f32(_sum, 2); + sum += vgetq_lane_f32(_sum, 3); + float sqsum = vgetq_lane_f32(_sq_sum, 0) + vgetq_lane_f32(_sq_sum, 1); + sqsum += vgetq_lane_f32(_sq_sum, 2); + sqsum += vgetq_lane_f32(_sq_sum, 3); + + for (int left = left4; left > 0; left--) + { + sum += *in_ptr; + sqsum += (*in_ptr) * (*in_ptr); + in_ptr++; + } + + float mean = sum / size; + float var = sqsum / size - mean * mean; + float gamma = gamma_mat[q]; + float beta = beta_mat[q]; + float a = gamma / (sqrt(var + eps)); + float b = -mean * a + beta; + + in_ptr = in_mat.channel(q); + float32x4_t _a = vdupq_n_f32(a); + float32x4_t _b = vdupq_n_f32(b); + for (int n = nn; n > 0; n--) + { + float32x4_t _p = vld1q_f32(in_ptr); + _p = vmulq_f32(_p, _a); + _p = vaddq_f32(_p, _b); + vst1q_f32(out_ptr, _p); + in_ptr += 4; + out_ptr += 4; + } + for (int left = left4; left > 0; left--) + { + *out_ptr = (*in_ptr) * a + b; + in_ptr++; + out_ptr++; + } +#else + float *in_ptr = in_mat.channel(q); + float *out_ptr = out_mat.channel(q); + // mean and var + float sum = 0.f; + float sqsum = 0.f; + for (int i = 0; i < size; i++) + { + sum += in_ptr[i]; + sqsum += in_ptr[i] * in_ptr[i]; + } + float mean = sum / size; + float var = sqsum / size - mean * mean; + + float gamma = gamma_mat[q]; + float beta = beta_mat[q]; + + float a = gamma / (sqrt(var + eps)); + float b = -mean * a + beta; + for (int i = 0; i < size; i++) + { + out_ptr[i] = in_ptr[i] * a + b; + } +#endif + } +} + +void ncnn_instance_norm_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, + int /*channels*/, float eps) +{ + // Treat CHW layout as HWC layout + int h = in_mat.c; + int w = in_mat.h; + int c = in_mat.w; + + int size = w * h; + int total = size * c; + + float sum[c] = {}; + float sqsum[c] = {}; + + float mean[c] = {}; + float var[c] = {}; + float a[c] = {}; + float b[c] = {}; + + float *in_ptr = in_mat.channel(0); + float *out_ptr = out_mat.channel(0); + +#pragma omp parallel for reduction(+ : sum, sqsum) schedule(guided) + for (int i = 0; i < total; i += c) + { + for (int j = 0; j < c; j++) + { + sum[j] += in_ptr[i + j]; + sqsum[j] += in_ptr[i + j] * in_ptr[i + j]; + } + } + + for (int i = 0; i < c; i++) + { + mean[i] = sum[i] / size; + var[i] = sqsum[i] / size - mean[i] * mean[i]; + a[i] = gamma_mat[i] / (sqrt(var[i] + eps)); + b[i] = -mean[i] * a[i] + beta_mat[i]; + } + +#pragma omp parallel for schedule(guided) + for (int i = 0; i < total; i += c) + { + for (int j = 0; j < c; j++) + { + out_ptr[i + j] = in_ptr[i + j] * 
a[j] + b[j];
+    }
+  }
+}
+
+void ncnn_instance_norm_with_relu_rowmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat,
+                                           int channels, float eps, float slope)
+{
+  int w = in_mat.w;
+  int h = in_mat.h;
+  int size = w * h;
+#ifdef __ARM_NEON
+  int nn = size >> 2;
+  int left4 = size & 3;
+#endif
+#pragma omp parallel for
+  for (int q = 0; q < channels; q++)
+  {
+#ifdef __ARM_NEON
+    float *in_ptr = in_mat.channel(q);
+    float *out_ptr = out_mat.channel(q);
+    float32x4_t _sum = vdupq_n_f32(0.f);
+    float32x4_t _sq_sum = vdupq_n_f32(0.f);
+    for (int n = nn; n > 0; n--)
+    {
+      float32x4_t _p = vld1q_f32(in_ptr);
+      _sum = vaddq_f32(_sum, _p);
+      _p = vmulq_f32(_p, _p);
+      _sq_sum = vaddq_f32(_sq_sum, _p);
+      in_ptr += 4;
+    }
+    float sum = vgetq_lane_f32(_sum, 0) + vgetq_lane_f32(_sum, 1);
+    sum += vgetq_lane_f32(_sum, 2);
+    sum += vgetq_lane_f32(_sum, 3);
+    float sqsum = vgetq_lane_f32(_sq_sum, 0) + vgetq_lane_f32(_sq_sum, 1);
+    sqsum += vgetq_lane_f32(_sq_sum, 2);
+    sqsum += vgetq_lane_f32(_sq_sum, 3);
+    for (int left = left4; left > 0; left--)
+    {
+      sum += *in_ptr;
+      sqsum += (*in_ptr) * (*in_ptr);
+      in_ptr++;
+    }
+
+    float mean = sum / size;
+    float var = sqsum / size - mean * mean;
+    float gamma = gamma_mat[q];
+    float beta = beta_mat[q];
+    float a = gamma / (sqrt(var + eps));
+    float b = -mean * a + beta;
+    // TODO: slope is ignored in the NEON path; it implements plain ReLU, which
+    // assumes slope == 0.
+    in_ptr = in_mat.channel(q);
+    float32x4_t _a = vdupq_n_f32(a);
+    float32x4_t _b = vdupq_n_f32(b);
+    float32x4_t _zero = vdupq_n_f32(0.f);
+    for (int n = nn; n > 0; n--)
+    {
+      float32x4_t _p = vld1q_f32(in_ptr);
+      _p = vmulq_f32(_p, _a);
+      _p = vaddq_f32(_p, _b);
+      _p = vmaxq_f32(_p, _zero);
+      vst1q_f32(out_ptr, _p);
+      in_ptr += 4;
+      out_ptr += 4;
+    }
+    for (int left = left4; left > 0; left--)
+    {
+      float temp = (*in_ptr) * a + b;
+      *out_ptr = temp > 0 ? temp : 0;
+      in_ptr++;
+      out_ptr++;
+    }
+#else
+    float *in_ptr = in_mat.channel(q);
+    float *out_ptr = out_mat.channel(q);
+
+    // mean and var
+    float sum = 0.f;
+    float sqsum = 0.f;
+    for (int i = 0; i < size; i++)
+    {
+      sum += in_ptr[i];
+      sqsum += in_ptr[i] * in_ptr[i];
+    }
+    float mean = sum / size;
+    float var = sqsum / size - mean * mean;
+
+    float gamma = gamma_mat[q];
+    float beta = beta_mat[q];
+
+    float a = gamma / (sqrt(var + eps));
+    float b = -mean * a + beta;
+
+    if (slope == 0.f)
+    {
+      for (int i = 0; i < size; i++)
+      {
+        float temp = in_ptr[i] * a + b;
+        out_ptr[i] = temp > 0 ? temp : 0;
+      }
+    }
+    else
+    {
+      for (int i = 0; i < size; i++)
+      {
+        float temp = in_ptr[i] * a + b;
+        out_ptr[i] = temp > 0 ?
temp : temp * slope; + } + } +#endif + } +} + +void ncnn_instance_norm_with_relu_colmajor(Mat &in_mat, Mat &out_mat, Mat &gamma_mat, Mat &beta_mat, + int /*channels*/, float eps, float slope) +{ + // Treat CHW layout as HWC layout + int h = in_mat.c; + int w = in_mat.h; + int c = in_mat.w; + + int size = w * h; + int total = size * c; + + float sum[c] = {}; + float sqsum[c] = {}; + + float mean[c] = {}; + float var[c] = {}; + float a[c] = {}; + float b[c] = {}; + + float *in_ptr = in_mat.channel(0); + float *out_ptr = out_mat.channel(0); + +#pragma omp parallel for reduction(+ : sum, sqsum) schedule(guided) + for (int i = 0; i < total; i += c) + { + for (int j = 0; j < c; j++) + { + sum[j] += in_ptr[i + j]; + sqsum[j] += in_ptr[i + j] * in_ptr[i + j]; + } + } + + for (int i = 0; i < c; i++) + { + mean[i] = sum[i] / size; + var[i] = sqsum[i] / size - mean[i] * mean[i]; + a[i] = gamma_mat[i] / (sqrt(var[i] + eps)); + b[i] = -mean[i] * a[i] + beta_mat[i]; + } + + if (slope == 0.f) + { +#pragma omp parallel for schedule(guided) + for (int i = 0; i < total; i += c) + { + for (int j = 0; j < c; j++) + { + float temp = in_ptr[i + j] * a[j] + b[j]; + out_ptr[i + j] = temp > 0 ? temp : 0; + } + } + } + else + { +#pragma omp parallel for schedule(guided) + for (int i = 0; i < total; i += c) + { + for (int j = 0; j < c; j++) + { + float temp = in_ptr[i + j] * a[j] + b[j]; + out_ptr[i + j] = temp > 0 ? temp : temp * slope; + } + } + } +} + +} // namespace ncnn + +} // namespace nnfw diff --git a/compute/ncnn/src/mat.cc b/compute/ncnn/src/mat.cc new file mode 100644 index 000000000..568378ef7 --- /dev/null +++ b/compute/ncnn/src/mat.cc @@ -0,0 +1,940 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ncnn/mat.h" + +#if __ARM_NEON +#include <arm_neon.h> +#endif // __ARM_NEON + +// Fix for nnfw: comment out cpu.h +//#include "cpu.h" + +namespace nnfw +{ +namespace ncnn +{ + +void Mat::substract_mean_normalize(const float *mean_vals, const float *norm_vals) +{ + int size = w * h; + + if (mean_vals && !norm_vals) + { +// substract mean only +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float mean = mean_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean) // %4 + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vsub.f32 q0, q0, q1 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean) // %4 + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr -= mean; + ptr++; + } + } + } + else if (!mean_vals && norm_vals) + { +// normalize only +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float norm = norm_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fmul v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(norm) // %4 + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vmul.f32 q0, q0, q1 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(norm) // %4 + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr *= norm; + ptr++; + } + } + } + else if (mean_vals && norm_vals) + { +// substract mean and normalize +#pragma omp parallel for + for (int q = 0; q < c; q++) + { + float *ptr = channel(q); // data + cstep * q; + const float mean = mean_vals[q]; + const float norm = norm_vals[q]; + +#if __ARM_NEON + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + if (nn > 0) + { + asm volatile("dup v1.4s, %w4 \n" + "dup v2.4s, %w5 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "fmul v0.4s, v0.4s, v2.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean), // %4 + "r"(norm) // %5 + : "cc", "memory", "v0", "v1", "v2"); + } +#else + if (nn > 0) + { + asm volatile("vdup.f32 q1, %4 \n" + "vdup.f32 q2, %5 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1 :128] \n" + "vsub.f32 q0, q0, q1 \n" + "vmul.f32 q0, q0, q2 \n" + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), "1"(ptr), + "r"(mean), // %4 + "r"(norm) // %5 + : "cc", "memory", "q0", "q1", "q2"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr = (*ptr - mean) * norm; + ptr++; + } + } + } +} + +// convert half precision floating point to float +static float half2float(unsigned short value) +{ + // 1 : 5 : 10 + unsigned short sign = (value & 0x8000) >> 15; + unsigned short exponent = (value & 0x7c00) >> 10; + unsigned short significand = value & 0x03FF; + + // fprintf(stderr, "%d %d %d\n", sign, exponent, significand); + + // 1 : 8 : 23 + union { + unsigned int u; + float f; + } tmp; + if (exponent == 0) + { + if (significand == 0) + { + // zero + tmp.u = (sign << 31); + } + else + { + // denormal + exponent = 0; + // find non-zero bit + while ((significand & 0x200) == 0) + { + significand <<= 1; + exponent++; + } + significand <<= 1; + significand &= 0x3FF; + tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); + } + } + else if (exponent == 0x1F) + { + // infinity or NaN + tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13); + } + else + { + // normalized + tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); + } + + return tmp.f; +} + +Mat Mat::from_float16(const unsigned short *data, int size) +{ + Mat m(size); + if (m.empty()) + return m; + + float *ptr = m; //.data; + +#if __ARM_NEON && (__ARM_FP & 2) + // Fix for nnfw: Alway support vfpv4 + // int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0; + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON && (__ARM_FP & 2) +#if __aarch64__ + if (nn > 0) + { + asm volatile("0: \n" + "ld1 {v0.4h}, [%1], #8 \n" + "fcvtl v1.4s, v0.4h \n" + "subs %w0, %w0, #1 \n" + "st1 {v1.4s}, [%2], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(data), // %1 + "=r"(ptr) // %2 + : "0"(nn), "1"(data), "2"(ptr) + : "cc", "memory", "v0", "v1"); + } +#else + if (nn > 0) + { + asm volatile("0: \n" + "pld [%1, #64] \n" + "vld1.s16 {d0}, [%1 :64]! \n" + "vcvt.f32.f16 q1, d0 \n" + "subs %0, #1 \n" + "vst1.f32 {d2-d3}, [%2 :128]! 
\n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(data), // %1 + "=r"(ptr) // %2 + : "0"(nn), "1"(data), "2"(ptr) + : "cc", "memory", "q0", "q1"); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain > 0; remain--) + { + *ptr = half2float(*data); + + data++; + ptr++; + } + + return m; +} + +static void copy_make_border_image(const Mat &src, Mat &dst, int top, int left, int type, float v) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src; //.data; + float *outptr = dst; //.data; + + if (type == BORDER_CONSTANT) + { + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + // fill center + for (; y < (top + src.h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = v; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = v; + } + ptr += src.w; + outptr += w; + } + // fill bottom + for (; y < h; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + } + else if (type == BORDER_REPLICATE) + { + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + outptr += w; + } + // fill center + for (; y < (top + src.h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + ptr += src.w; + outptr += w; + } + // fill bottom + ptr -= src.w; + for (; y < h; y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = ptr[0]; + } + if (src.w < 12) + { + for (; x < (left + src.w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, src.w * sizeof(float)); + x += src.w; + } + for (; x < w; x++) + { + outptr[x] = ptr[src.w - 1]; + } + outptr += w; + } + } +} + +#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) +static void copy_make_border_image_inplace(const Mat &src, Mat &dst, int top, int left, int type, + float v) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src; + float *outptr = dst; + + if (type == BORDER_CONSTANT) + { + // fill bottom + int y = src.h + top; + outptr += y * w; + for (; y < h; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + + // fill center + y = src.h + top - 1; + outptr = dst; + outptr += y * w; + ptr += (src.h - 1) * src.w; + + for (; y >= top; y--) + { + int x = left + src.w; + for (; x < w; x++) + { + outptr[x] = v; + } + + x = left + src.w - 1; + + for (; x >= left; x--) + { + outptr[x] = ptr[x - left]; + } + + for (x = 0; x < left; x++) + { + outptr[x] = v; + } + ptr -= src.w; + outptr -= w; + } + + // fill top + y = 0; + outptr = dst; + for (; y < top; y++) + { + int x = 0; + for (; x < w; x++) + { + outptr[x] = v; + } + outptr += w; + } + } +} +#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ + +void copy_make_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right, int type, + float v) +{ + int w = 
src.w + left + right; + int h = src.h + top + bottom; + + if (w == src.w && h == src.h) + { + dst = src; + return; + } + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + copy_make_border_image(src, dst, top, left, type, v); + } + else if (src.dims == 3) + { + int channels = src.c; + dst.create(w, h, channels); + if (dst.empty()) + return; + + if (src.data != dst.data) + { +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat borderm = dst.channel(q); + + copy_make_border_image(m, borderm, top, left, type, v); + } + } + else + { +#if defined(_MEMORY_TO_TIME_) && defined(_TIME_TO_MEMORY_) + for (int q = channels - 1; q >= 0; q--) + { + Mat m = src.channel(q); + Mat borderm = dst.channel(q); + copy_make_border_image_inplace(m, borderm, top, left, type, v); + } +#else +// unroll image channel +#pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat borderm = dst.channel(q); + + copy_make_border_image(m, borderm, top, left, type, v); + } +#endif // _MEMORY_TO_TIME_ && _TIME_TO_MEMORY_ + } + } +} + +static void copy_cut_border_image(const Mat &src, Mat &dst, int top, int left) +{ + int w = dst.w; + int h = dst.h; + + const float *ptr = src.row(top) + left; //.data + src.w * top + left; + float *outptr = dst; //.data; + + for (int y = 0; y < h; y++) + { + if (w < 12) + { + for (int x = 0; x < w; x++) + { + outptr[x] = ptr[x]; + } + } + else + { + memcpy(outptr, ptr, w * sizeof(float)); + } + outptr += w; + ptr += src.w; + } +} + +void copy_cut_border(const Mat &src, Mat &dst, int top, int bottom, int left, int right) +{ + int w = src.w - left - right; + int h = src.h - top - bottom; + +#ifndef _MEMORY_TO_TIME_ + if (w == src.w && h == src.h) + { + dst = src; + return; + } +#endif + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + + copy_cut_border_image(src, dst, top, left); + } + else if (src.dims == 3) + { + int channels = src.c; + + dst.create(w, h, channels); + if (dst.empty()) + return; + +#if !defined(_MEMORY_TO_TIME_) || !defined(_TIME_TO_MEMORY_) +// unroll image channel +#pragma omp parallel for +#endif + for (int q = 0; q < channels; q++) + { + const Mat m = src.channel(q); + Mat cutm = dst.channel(q); + + copy_cut_border_image(m, cutm, top, left); + } + } +} + +static void resize_bilinear_image(const Mat &src, Mat &dst, int w, int h) +{ + double scale_x = (double)src.w / w; + double scale_y = (double)src.h / h; + + int *buf = new int[w + h + w * 2 + h * 2]; + + int *xofs = buf; // new int[w]; + int *yofs = buf + w; // new int[h]; + + float *alpha = (float *)(buf + w + h); // new float[w * 2]; + float *beta = (float *)(buf + w + h + w * 2); // new float[h * 2]; + + float fx; + float fy; + int sx; + int sy; + + for (int dx = 0; dx < w; dx++) + { + fx = (float)((dx + 0.5) * scale_x - 0.5); + sx = fx; // cvFloor(fx); + fx -= sx; + + if (sx >= src.w - 1) + { + sx = src.w - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } + + for (int dy = 0; dy < h; dy++) + { + fy = (float)((dy + 0.5) * scale_y - 0.5); + sy = fy; // cvFloor(fy); + fy -= sy; + + if (sy >= src.h - 1) + { + sy = src.h - 2; + fy = 1.f; + } + + yofs[dy] = sy; + + beta[dy * 2] = 1.f - fy; + beta[dy * 2 + 1] = fy; + } + + // loop body + Mat rowsbuf0(w + 1); + Mat rowsbuf1(w + 1); + float *rows0 = rowsbuf0; + float *rows1 = rowsbuf1; + + int prev_sy1 = -1; + + for (int dy = 0; dy < h; dy++) + { + int sy = 
yofs[dy]; + + if (sy == prev_sy1) + { + // hresize one row + float *rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float *S1 = src.row(sy + 1); + + const float *alphap = alpha; + float *rows1p = rows1; + int dx = 0; +#if __ARM_NEON + for (; dx + 1 < w; dx += 2) + { + int sx = xofs[dx]; + int sxn = xofs[dx + 1]; + const float *S1p = S1 + sx; + const float *S1np = S1 + sxn; + + float32x4_t _a = vld1q_f32(alphap); + float32x2_t _S1 = vld1_f32(S1p); + float32x2_t _S1n = vld1_f32(S1np); + + float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); + float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); + float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); + + vst1_f32(rows1p + dx, _rows1); + + alphap += 4; + } +#endif // __ARM_NEON + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float *S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + else + { + // hresize two rows + const float *S0 = src.row(sy); + const float *S1 = src.row(sy + 1); + + const float *alphap = alpha; + float *rows0p = rows0; + float *rows1p = rows1; + int dx = 0; +#if __ARM_NEON + for (; dx + 1 < w; dx += 2) + { + int sx = xofs[dx]; + int sxn = xofs[dx + 1]; + const float *S0p = S0 + sx; + const float *S1p = S1 + sx; + const float *S0np = S0 + sxn; + const float *S1np = S1 + sxn; + + float32x4_t _a = vld1q_f32(alphap); + float32x2_t _S0 = vld1_f32(S0p); + float32x2_t _S1 = vld1_f32(S1p); + float32x2_t _S0n = vld1_f32(S0np); + float32x2_t _S1n = vld1_f32(S1np); + + float32x4_t _S0S0n = vcombine_f32(_S0, _S0n); + float32x4_t _S1S1n = vcombine_f32(_S1, _S1n); + float32x4_t _ms0 = vmulq_f32(_S0S0n, _a); + float32x4_t _ms1 = vmulq_f32(_S1S1n, _a); + float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); + float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); + + vst1_f32(rows0p + dx, _rows0); + vst1_f32(rows1p + dx, _rows1); + + alphap += 4; + } +#endif // __ARM_NEON + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float *S0p = S0 + sx; + const float *S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + + prev_sy1 = sy + 1; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + + float *rows0p = rows0; + float *rows1p = rows1; + float *Dp = dst.row(dy); + +#if __ARM_NEON + int nn = w >> 3; +#else + int nn = 0; +#endif + int remain = w - (nn << 3); + +#if __ARM_NEON + float32x4_t _b0 = vdupq_n_f32(b0); + float32x4_t _b1 = vdupq_n_f32(b1); + for (; nn > 0; nn--) + { + float32x4_t _rows0 = vld1q_f32(rows0p); + float32x4_t _rows1 = vld1q_f32(rows1p); + + float32x4_t _D = vmulq_f32(_rows0, _b0); + _D = vmlaq_f32(_D, _rows1, _b1); + + vst1q_f32(Dp, _D); + + float32x4_t _rows0n = vld1q_f32(rows0p + 4); + float32x4_t _rows1n = vld1q_f32(rows1p + 4); + + float32x4_t _Dn = vmulq_f32(_rows0n, _b0); + _Dn = vmlaq_f32(_Dn, _rows1n, _b1); + + vst1q_f32(Dp + 4, _Dn); + + Dp += 8; + rows0p += 8; + rows1p += 8; + } +#endif // __ARM_NEON + for (; remain; --remain) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + beta += 2; + } + + delete[] buf; +} + +void resize_bilinear(const Mat &src, Mat &dst, int w, int h) +{ + if (w == src.w && h == src.h) + { + dst = src; + return; + } + + if (src.dims == 2) + { + dst.create(w, h); + if (dst.empty()) + return; + + resize_bilinear_image(src, dst, w, h); + } + else if (src.dims == 3) + { + int 
channels = src.c;
+
+    dst.create(w, h, channels);
+    if (dst.empty())
+      return;
+
+// unroll image channel
+#pragma omp parallel for
+    for (int q = 0; q < channels; q++)
+    {
+      const Mat m = src.channel(q);
+      Mat resizem = dst.channel(q);
+
+      resize_bilinear_image(m, resizem, w, h);
+    }
+  }
+}
+
+} // namespace ncnn
+} // namespace nnfw
diff --git a/compute/ncnn/src/srcn/common.h b/compute/ncnn/src/srcn/common.h
new file mode 100644
index 000000000..778a17a80
--- /dev/null
+++ b/compute/ncnn/src/srcn/common.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_COMMON_H__
+#define __NNFW_SRCN_COMMON_H__
+
+#include <string.h>
+#include <limits>
+#include <arm_neon.h>
+
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+#define sizeof_RhsScalar 4
+#define sizeof_LhsScalar 4
+#define sizeof_ResScalar 4
+
+// Fully parenthesized so the expansion stays well-formed when the macro
+// appears inside a larger expression.
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+enum shardType_t
+{
+  shardByCol = 0,
+  shardByRow
+};
+
+#ifdef TIZEN
+#define L1_CACHE_SIZE (16536 * 2)
+#define L2_CACHE_SIZE (524288 * 2)
+#define L3_CACHE_SIZE (0) // no L3
+#define MAX_K (512)
+// single-thread
+#define GEN_COL (1440)
+// multi-threads
+#define MAX_COL (90)
+#define MIN_COL (32)
+#elif defined ANDROID
+#define L1_CACHE_SIZE (16536 * 4)
+#define L2_CACHE_SIZE (524288 * 8)
+#define L3_CACHE_SIZE (0) //(524288 * 8) //no L3
+#define MAX_K (512 * 2)
+// single-thread
+#define GEN_COL (1440)
+// multi-threads
+#if __aarch64__
+#define MAX_COL (1024)
+#else
+#define MAX_COL (90)
+#endif
+#define MIN_COL (32)
+#endif
+
+enum
+{
+  USE_COMMON_KENEL = 0,
+  USE_12BIT_KERNEL,
+  USE_NONZERO_KERENL
+};
+
+template <typename T> static T divup(const T &x, const T &y)
+{
+  return static_cast<T>((x + y - 1) / y);
+}
+
+#ifdef NCNN
+static inline size_t alignSize(size_t sz, int n) { return (sz + n - 1) / n * n; }
+
+static inline size_t alignBy2(size_t sz) { return (sz + 1) & -2; }
+#endif // NCNN
+
+static inline int32_t BitNot(int32_t a) { return ~a; }
+
+static inline int32_t MaskIfNonZero(int32_t a)
+{
+  static int32_t zero = 0;
+  return a ?
BitNot(zero) : zero; +} + +static inline int32_t BitAnd(int32_t a, int32_t b) { return a & b; } + +static inline int32_t ShiftRight(int32_t a, int offset) { return a >> offset; } + +static inline int32_t MaskIfLessThan(int32_t a, int32_t b) { return MaskIfNonZero(a < b); } + +static inline int32_t MaskIfGreaterThan(int32_t a, int32_t b) { return MaskIfNonZero(a > b); } + +static inline int32_t Add(int32_t a, int32_t b) { return a + b; } + +static inline int32_t RoundingDivideByPOT(int32_t x, int exponent) +{ + const int32_t mask = (1ll << exponent) - 1; + const int32_t zero = 0; + const int32_t one = 1; + const int32_t remainder = BitAnd(x, mask); + const int32_t threshold = Add(ShiftRight(mask, 1), BitAnd(MaskIfLessThan(x, zero), one)); + return Add(ShiftRight(x, exponent), BitAnd(MaskIfGreaterThan(remainder, threshold), one)); +} +static inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) +{ + bool overflow = a == b && a == std::numeric_limits<int32_t>::min(); + int64_t a_64(a); + int64_t b_64(b); + int64_t ab_64 = a_64 * b_64; + int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); + int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31)); + return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32; +} + +static inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, + int shift) +{ + int left_shift = shift > 0 ? shift : 0; + int right_shift = shift > 0 ? 0 : -shift; + return RoundingDivideByPOT( + SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift); +} + +static inline int32x4_t SaturatingRoundingDoublingHighMulV(int32x4_t a, int32x4_t b) +{ + return vqrdmulhq_s32(a, b); +} + +static inline int32x4_t RoundingDivideByPOTV(int32x4_t x, int exponent) +{ + const int32x4_t shift_vec = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); + const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed_up_x, shift_vec); +} + +static inline int32x4_t MultiplyByQuantizedMultiplierV(int32x4_t x, int32_t quantized_multiplier, + int shift) +{ + int left_shift = shift > 0 ? shift : 0; + int right_shift = shift > 0 ? 0 : -shift; + return RoundingDivideByPOTV( + SaturatingRoundingDoublingHighMulV(vrshlq_s32(x, vdupq_n_s32(left_shift)), + vdupq_n_s32(quantized_multiplier)), + right_shift); +} + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_COMMON_H__ diff --git a/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc new file mode 100644 index 000000000..21083f677 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_sgemm_multithreads.cc @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "ncnn/srcn/conv_type.h" +#include "common.h" +#include "sgemm_kernel.h" +#include "sgemm_pack.h" +#include "conv_sgemm_multithreads.h" + +namespace nnfw +{ +namespace srcn +{ + +void conv_sgemm_multithreads::param_init() +{ +#if __aarch64__ + if (conv_type_ == row_major) + { + mr_ = 8; + nr_ = 12; + } + else if (conv_type_ == col_major) + { +#ifdef BATCH_DILATION_FIX + if (out_mat_.n > 1) + { + + mr_ = 24; + nr_ = 4; + } + else +#endif // BATCH_DILATION_FIX + { + if (m_ > n_) + { + mr_ = 24; + nr_ = 4; + } + else + { + mr_ = 12; + nr_ = 8; + } + } + } +#else // __aarch64__ + if (conv_type_ == row_major) + { + mr_ = 6; + nr_ = 8; + } + else if (conv_type_ == col_major) + { + mr_ = 8; + nr_ = 6; + } +#endif // __aarch64__ + int col = n_; + + if (m_ > n_) + { + shard_type_ = shardByRow; + col = m_; + } + else + { + shard_type_ = shardByCol; + } + + int th_base = divup(col, num_threads_); + + th_base = MIN(MAX(th_base, MIN_COL), MAX_COL); + + int k_div = (nr_ * sizeof_RhsScalar); + int k_sub = (mr_ * nr_ * sizeof_ResScalar); + + const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div * 2), MAX_K); + bk_ = MIN(k_cache, k_); + + if (shard_type_ == shardByCol) + { + int m_sub = (bk_ * nr_ * sizeof_RhsScalar); + int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + if (L3_CACHE_SIZE) + m_div = (sizeof_LhsScalar * bk_ * 2); + int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div); + bm_ = MIN(m_cache, m_); + + bn_ = MIN(th_base, n_); + if (L3_CACHE_SIZE) + { + int n_sub = (bk_ * bm_ * sizeof_RhsScalar); + int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + int n_cache = divup((L3_CACHE_SIZE - n_sub), n_div); + bn_ = MIN(n_cache, bn_); + } + } + else + { + int n_sub = (bk_ * mr_ * sizeof_LhsScalar); + int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + if (L3_CACHE_SIZE) + n_div = (sizeof_LhsScalar * bk_ * 2); + int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div); + bn_ = MIN(n_cache, n_); + + bm_ = MIN(th_base, m_); + if (L3_CACHE_SIZE) + { + int m_sub = (bk_ * bn_ * sizeof_RhsScalar); + int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + int m_cache = divup((L3_CACHE_SIZE - m_sub), m_div); + bm_ = MIN(m_cache, bm_); + } + } + + nm_ = divup(m_, bm_); + nn_ = divup(n_, bn_); + nk_ = divup(k_, bk_); + + rm_ = m_ % bm_; + rn_ = n_ % bn_; + rk_ = k_ % bk_; +} + +conv_sgemm_multithreads::conv_sgemm_multithreads(const convMat_t &in_mat, + const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, int num_threads, + convType_t conv_type) + + : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param), + conv_type_(conv_type), num_threads_(num_threads) +{ + m_ = out_mat_.c; +#ifdef NCNN +#ifdef WITH_DPU + np_ = out_mat_.n * alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float)); + n_ = (np_ + 1) / 2; +#else // WITH_DPU + n_ = out_mat_.n * alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float)); +#endif // WITH_DPU +#else // NCNN +#ifdef WITH_DPU + np_ = out_mat_.n * out_mat_.w * out_mat_.h; + n_ = (np_ + 1) / 2; +#else // WITH_DPU + n_ = out_mat_.n * out_mat_.w * out_mat_.h; +#endif // WITH_DPU +#endif // NCNN + k_ = in_param_.kernel_h * in_param_.kernel_w * in_mat.c; + + param_init(); + + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + if (shard_type_ == shardByCol) + { + plhs_buffer_ = new float[lhs_stride * 1 * nm_]; + prhs_buffer_ = new float[rhs_stride * num_threads_]; + } + else + { + 
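+    // shardByRow: the packed LHS lives in per-thread scratch (one slot per
+    // thread), while one packed RHS panel is kept per column block so it can
+    // be reused across all row blocks (see the *_rowshard paths below).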
plhs_buffer_ = new float[lhs_stride * num_threads_]; + prhs_buffer_ = new float[rhs_stride * 1 * nn_]; + } + + if (plhs_buffer_ == NULL || prhs_buffer_ == NULL) + { + error_ = 1; + } + + if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 || + in_param_.stride_h != 1 || in_param_.padding != 0) + { + need_im2col_ = 1; + } + else + { + need_im2col_ = 0; + } + + omp_set_num_threads(num_threads_); + + error_ = 0; +} + +conv_sgemm_multithreads::~conv_sgemm_multithreads() +{ + if (plhs_buffer_) + delete[] plhs_buffer_; + if (prhs_buffer_) + delete[] prhs_buffer_; +} + +void conv_sgemm_multithreads::run() +{ + if (error_) + return; + + if (shard_type_ == shardByCol && conv_type_ == col_major) + { + compute_colmajor_colshard(); + } + else if (shard_type_ == shardByRow && conv_type_ == col_major) + { + compute_colmajor_rowshard(); + } + else if (shard_type_ == shardByCol && conv_type_ == row_major) + { + compute_rowmajor_colshard(); + } + else if (shard_type_ == shardByRow && conv_type_ == row_major) + { + compute_rowmajor_rowshard(); + } +} + +void conv_sgemm_multithreads::compute_rowmajor_colshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], + &plhs_buffer_[i * lhs_stride]); + } + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + int thread_num = omp_get_thread_num(); + // float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num]; + float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num]; + + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + else + { + _pack_rowmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + } + else + { +#ifdef WITH_DPU + _pack_rowmajor_notrans_rhs(nr_, bn, bk, np_, &in_mat_.data[n_ + l * bk_ * np_ + j * bn_], + prhs_ptr); +#else + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], + prhs_ptr); +#endif + } + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + +#ifdef WITH_DPU + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride], + prhs_ptr, &out_mat_.data[n_ + i * bm_ * np_ + j * bn_], + l, np_, bk); +#else // WITH_DPU + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride], + prhs_ptr, &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_, + bk); +#endif // WITH_DPU + } + } + } +} + +void conv_sgemm_multithreads::compute_rowmajor_rowshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
bn_ : rn_; + + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + &prhs_buffer_[j * rhs_stride]); + } + else + { + _pack_rowmajor_image_rhs_batch( + nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), &prhs_buffer_[j * rhs_stride]); + } + } + else + { + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], + &prhs_buffer_[j * rhs_stride]); + } + } + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + int thread_num = omp_get_thread_num(); + float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num]; + + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], + plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, + &prhs_buffer_[j * rhs_stride], + &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } +} + +void conv_sgemm_multithreads::compute_colmajor_colshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], + &plhs_buffer_[i * lhs_stride]); + } + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + int thread_num = omp_get_thread_num(); + float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num]; + + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + else + { + _pack_colmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + } + else + { + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], + prhs_ptr); + } + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride], + prhs_ptr, &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_, + bk); + } + } + } +} + +void conv_sgemm_multithreads::compute_colmajor_rowshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
bn_ : rn_; + + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + &prhs_buffer_[j * rhs_stride]); + } + else + { + _pack_colmajor_image_rhs_batch( + nr_, bn, bk, l * bk_, j * bn_, const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), &prhs_buffer_[j * rhs_stride]); + } + } + else + { + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], + &prhs_buffer_[j * rhs_stride]); + } + } + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + int thread_num = omp_get_thread_num(); + float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num]; + + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], + plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, + &prhs_buffer_[j * rhs_stride], + &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/conv_sgemm_multithreads.h b/compute/ncnn/src/srcn/conv_sgemm_multithreads.h new file mode 100644 index 000000000..9c9ce7437 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_sgemm_multithreads.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
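+ */
+
+// A minimal usage sketch for the class declared below; field names follow
+// the accesses in conv_sgemm_multithreads.cc, and filling the mats/params
+// is left to the caller:
+//
+//   convMat_t in, weights, out;   // data/w/h/c/n set up by the caller
+//   convParams_t params;          // kernel_*/stride_*/pad_*/padding set up
+//   conv_sgemm_multithreads gemm(in, weights, out, params,
+//                                /*num_threads=*/4, col_major);
+//   gemm.run();                   // pack blocks, run the tiled SGEMM
+
+/*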
+ */ + +#ifndef __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__ +#define __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__ + +#include "ncnn/srcn/conv_type.h" +#include "common.h" + +namespace nnfw +{ +namespace srcn +{ + +class conv_sgemm_multithreads +{ +public: + conv_sgemm_multithreads(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, int num_threads, convType_t conv_type); + ~conv_sgemm_multithreads(); + + void run(); + +private: + void param_init(); + + void compute_rowmajor_colshard(); + void compute_rowmajor_rowshard(); + void compute_colmajor_colshard(); + void compute_colmajor_rowshard(); + + const convMat_t in_mat_; + const convMat_t weights_mat_; + convMat_t out_mat_; + const convParams_t in_param_; + convType_t conv_type_; + int num_threads_; + + int m_; + int n_; +#ifdef WITH_DPU + int np_; +#endif + int k_; + + int bm_; + int bn_; + int bk_; + + int rm_; + int rn_; + int rk_; + + int nm_; + int nn_; + int nk_; + + int mr_; + int nr_; + + int need_im2col_; + shardType_t shard_type_; + + float *prhs_buffer_; + float *plhs_buffer_; + + int error_; +}; + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_CONV_SGEMM_MULTITHREADS_H__ diff --git a/compute/ncnn/src/srcn/conv_sgemm_singlethread.cc b/compute/ncnn/src/srcn/conv_sgemm_singlethread.cc new file mode 100644 index 000000000..4cbbf217f --- /dev/null +++ b/compute/ncnn/src/srcn/conv_sgemm_singlethread.cc @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
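+ */
+
+// Single-threaded counterpart of conv_sgemm_multithreads: the same
+// pack-then-multiply blocking, but block sizes are budgeted for one core
+// in param_init() and row sharding is chosen only when n_ > 3 * m_.
+
+/*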
+ */ + +#include <stdexcept> + +#include "common.h" +#include "sgemm_kernel.h" +#include "sgemm_pack.h" +#include "conv_sgemm_singlethread.h" + +namespace nnfw +{ +namespace srcn +{ + +void conv_sgemm_singlethread::param_init() +{ + if (n_ > 3 * m_) + { + shard_type_ = shardByRow; + } + else + { + shard_type_ = shardByCol; + } + +#if __aarch64__ + if (conv_type_ == row_major) + { + if (shard_type_ == shardByRow) + { + mr_ = 8; + nr_ = 12; + } + else + { + mr_ = 12; + nr_ = 8; + } + } + else if (conv_type_ == col_major) + { +#ifndef BATCH_DILATION_FIX + mr_ = 12; + nr_ = 8; +#else // BATCH_DILATION_FIX + // TODO: batch(dilation) + inw * inh + if (out_mat_.n > 1) + { + mr_ = 24; + nr_ = 4; + } + else + { + mr_ = 12; + nr_ = 8; + } +#endif // BATCH_DILATION_FIX + } +#else // __aarch64__ + if (conv_type_ == row_major) + { + mr_ = 6; + nr_ = 8; + } + else if (conv_type_ == col_major) + { + mr_ = 8; + nr_ = 6; + } +#endif // __aarch64__ + + int k_div = (nr_ * sizeof_RhsScalar); + int k_sub = (mr_ * nr_ * sizeof_ResScalar); + + const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div), MAX_K); + bk_ = MIN(k_cache, k_); + + if (shard_type_ == shardByCol) + { + int m_sub = (bk_ * nr_ * sizeof_RhsScalar); + int m_cache = divup((L2_CACHE_SIZE - m_sub), (sizeof_LhsScalar * bk_ * 2)); + bm_ = MIN(m_cache, m_); + + bn_ = MIN(GEN_COL, n_); + if (L3_CACHE_SIZE) + { + int n_sub = (bk_ * bm_ * sizeof_RhsScalar); + int n_cache = divup((L3_CACHE_SIZE - n_sub), (sizeof_LhsScalar * bk_ * 2)); + bn_ = MIN(n_cache, bn_); + } + } + else + { + int n_sub = (bk_ * mr_ * sizeof_RhsScalar); + int n_cache = divup((L2_CACHE_SIZE - n_sub), (sizeof_LhsScalar * bk_ * 2)); + bn_ = MIN(n_cache, n_); + + bm_ = MIN(GEN_COL, m_); + if (L3_CACHE_SIZE) + { + int m_sub = (bk_ * bn_ * sizeof_RhsScalar); + int m_cache = divup((L3_CACHE_SIZE - m_sub), (sizeof_LhsScalar * bk_ * 2)); + bm_ = MIN(m_cache, bm_); + } + } + + nm_ = divup(m_, bm_); + nn_ = divup(n_, bn_); + nk_ = divup(k_, bk_); + + rm_ = m_ % bm_; + rn_ = n_ % bn_; + rk_ = k_ % bk_; +} + +conv_sgemm_singlethread::conv_sgemm_singlethread(const convMat_t &in_mat, + const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, convType_t conv_type) + : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param), + conv_type_(conv_type) +{ + m_ = out_mat_.c; +#ifdef NCNN + n_ = out_mat_.n * alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float)); +#else + n_ = out_mat_.n * out_mat_.w * out_mat_.h; +#endif + k_ = in_param_.kernel_h * in_param_.kernel_w * in_mat.c; + + param_init(); + + if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 || + in_param_.stride_h != 1 || in_param_.padding != 0 || out_mat_.n > 1) + { + need_im2col_ = 1; + } + else + { + need_im2col_ = 0; + } +} + +conv_sgemm_singlethread::~conv_sgemm_singlethread() {} + +void conv_sgemm_singlethread::run() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float *plhs_ptr = new float[mstride * bk_]; + float *prhs_ptr = new float[nstride * bk_]; + + if (conv_type_ == row_major) + { + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? 
bk_ : rk_; + + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + else + { + _pack_rowmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + } + else + { + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], + prhs_ptr); + } + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], + plhs_ptr); + + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], + plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_rowmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + else + { + _pack_rowmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + } + else + { + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], + prhs_ptr); + } + + _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &out_mat_.data[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else + { + throw std::runtime_error{"Error shrad type!"}; + } + } + else if (conv_type_ == col_major) + { + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + else + { + _pack_colmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + } + else + { + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], + prhs_ptr); + } + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], + plhs_ptr); + + _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? 
bk_ : rk_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], + plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + if (need_im2col_) + { + if (out_mat_.n == 1) + { + _pack_colmajor_image_rhs(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + else + { + _pack_colmajor_image_rhs_batch(nr_, bn, bk, l * bk_, j * bn_, + const_cast<convMat_t *>(&in_mat_), &out_mat_, + const_cast<convParams_t *>(&in_param_), prhs_ptr); + } + } + else + { + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], + prhs_ptr); + } + + _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &out_mat_.data[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else + { + throw std::runtime_error{"Error shrad type!"}; + } + } + else + { + throw std::runtime_error{"Error conv type!"}; + } + + delete[] plhs_ptr; + delete[] prhs_ptr; +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/conv_sgemm_singlethread.h b/compute/ncnn/src/srcn/conv_sgemm_singlethread.h new file mode 100644 index 000000000..63f8b6e66 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_sgemm_singlethread.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__ +#define __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__ + +#include "ncnn/srcn/conv_type.h" +#include "common.h" + +namespace nnfw +{ +namespace srcn +{ + +class conv_sgemm_singlethread +{ +public: + conv_sgemm_singlethread(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, convType_t conv_type); + ~conv_sgemm_singlethread(); + + void run(); + +private: + void param_init(); + + const convMat_t in_mat_; + const convMat_t weights_mat_; + convMat_t out_mat_; + const convParams_t in_param_; + convType_t conv_type_; + + int m_; + int n_; + int k_; + + int bm_; + int bn_; + int bk_; + + int rm_; + int rn_; + int rk_; + + int nm_; + int nn_; + int nk_; + + int mr_; + int nr_; + + int need_im2col_; + + shardType_t shard_type_; +}; + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_CONV_SGEMM_SINGLETHREAD_H__ diff --git a/compute/ncnn/src/srcn/conv_sparse.cc b/compute/ncnn/src/srcn/conv_sparse.cc new file mode 100644 index 000000000..10e2a2b93 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_sparse.cc @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include <stdexcept> + +#include "common.h" +#include "sgemm_kernel.h" +#include "sgemm_pack.h" +#include "conv_sparse.h" + +namespace nnfw +{ +namespace srcn +{ + +void conv_sparse::param_init() +{ +#ifdef NCNN + n_ = alignSize(out_mat_.h * out_mat_.w, 16 / sizeof(float)); +#else + n_ = out_mat_.w * out_mat_.h; +#endif + + bch_ = BCH; + nch_ = (out_mat_.c + bch_ - 1) / bch_; + + rch_ = out_mat_.c % bch_; + + bn_ = MIN(n_, L1_CACHE_SIZE / (sizeof(float) * 2)); + bn_ = MIN(bn_, (L2_CACHE_SIZE / 2 - bch_ * sizeof(weight_data_t)) / ((bch_ + 1) * sizeof(float)) / + num_threads_); + nn_ = (n_ + bn_ - 1) / bn_; + rn_ = n_ % bn_; + + if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 || + in_param_.stride_h != 1 || in_param_.padding != 0) + { + need_im2col_ = 1; + } + else + { + need_im2col_ = 0; + } +} + +conv_sparse::conv_sparse(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param, + const sparse_weight_t *weights, int num_threads, convType_t conv_type) + : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), weights_(weights), + num_threads_(num_threads), conv_type_(conv_type) +{ + param_init(); +} + +conv_sparse::~conv_sparse() {} + +void conv_sparse::compute_singlethread() +{ + if (need_im2col_) + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + float prhs_ptr[bn_]; + + for (int j = 0; j < nn_; j++) + { + int k = -1; + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + + for (int l = 0; l < mxk; l++) + { + if (k != lhs_ptr->k) + { + k = lhs_ptr->k; + _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + prhs_ptr); + } + + // Why n_ = 64 x 64 is too much slower on Tizen??? + _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + else + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float *rhs_ptr = in_mat_.data + j * bn_; + + for (int l = 0; l < mxk; l++) + { + // Why n_ = 64 x 64 is too much slower on Tizen??? + _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } +} + +void conv_sparse::compute_multithreads() +{ + omp_set_num_threads(num_threads_); + + if (nch_ >= num_threads_ || nch_ >= nn_) + { + if (need_im2col_) + { +#pragma omp parallel for + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + float prhs_ptr[bn_]; + + for (int j = 0; j < nn_; j++) + { + int k = -1; + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + + for (int l = 0; l < mxk; l++) + { + if (k != lhs_ptr->k) + { + k = lhs_ptr->k; + _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + prhs_ptr); + } + + _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + else + { +#pragma omp parallel for + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float *rhs_ptr = in_mat_.data + j * bn_; + + for (int l = 0; l < mxk; l++) + { + _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + } + else + { + if (need_im2col_) + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + int k = -1; + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float prhs_ptr[bn]; + + for (int l = 0; l < mxk; l++) + { + if (k != lhs_ptr->k) + { + k = lhs_ptr->k; + _sparse_pack_rowmajor_image(bn, k, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), + prhs_ptr); + } + + _sparse_sgemm_kernel(bn, lhs_ptr->data, prhs_ptr, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + else + { + for (int i = 0; i < nch_; i++) + { + const sparse_weight_t *weight_ptr = weights_ + i; + const int mxk = weight_ptr->mxk; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + weight_data_t *lhs_ptr = weight_ptr->wdata; + float *rhs_ptr = in_mat_.data + j * bn_; + + for (int l = 0; l < mxk; l++) + { + _sparse_sgemm_kernel(bn, lhs_ptr->data, rhs_ptr + lhs_ptr->k * n_, + &out_mat_.data[lhs_ptr->m * n_ + j * bn_]); + + lhs_ptr++; + } + } + } + } + } +} + +void conv_sparse::run() +{ + if (num_threads_ == 1) + compute_singlethread(); + else if (num_threads_ > 1) + compute_multithreads(); + else + throw std::runtime_error{"Invalid thread number."}; +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/conv_sparse.h b/compute/ncnn/src/srcn/conv_sparse.h new file mode 100644 index 000000000..7ac358fd8 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_sparse.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
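+ */
+
+// Observed call contract of _sparse_sgemm_kernel in conv_sparse.cc (a
+// scalar sketch; the actual kernel is defined elsewhere and may be
+// vectorized): each stored nonzero (m, k, value) accumulates one scaled
+// input row into one output row over the current column block.
+//
+//   static void sparse_saxpy_ref(int bn, float value, const float *rhs_row,
+//                                float *out_row)
+//   {
+//     for (int j = 0; j < bn; j++)
+//       out_row[j] += value * rhs_row[j];
+//   }
+
+/*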
+ */ + +#ifndef __NNFW_SRCN_CONV_SPARSE_H__ +#define __NNFW_SRCN_CONV_SPARSE_H__ + +#include "ncnn/srcn/conv_type.h" +#include "common.h" + +namespace nnfw +{ +namespace srcn +{ + +#define BCH 128 + +typedef struct +{ + short m; + short k; + float data; +} weight_data_t; + +typedef struct +{ + int mxk; + weight_data_t *wdata; +} sparse_weight_t; + +class conv_sparse +{ +public: + conv_sparse(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param, + const sparse_weight_t *weights, int num_threads, convType_t conv_type); + ~conv_sparse(); + + void run(); + +private: + void param_init(); + void compute_singlethread(); + void compute_multithreads(); + + const convMat_t in_mat_; + convMat_t out_mat_; + const convParams_t in_param_; + const sparse_weight_t *weights_; + int num_threads_; + convType_t conv_type_; + + uint32_t n_; + uint32_t bn_; + int rn_; + int nn_; + + int bch_; + int rch_; + int nch_; + + int need_im2col_; +}; + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_CONV_SPARSE_H__ diff --git a/compute/ncnn/src/srcn/conv_winograd.cc b/compute/ncnn/src/srcn/conv_winograd.cc new file mode 100644 index 000000000..69649ea2a --- /dev/null +++ b/compute/ncnn/src/srcn/conv_winograd.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
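+ */
+
+// Winograd F(m, r) convolution: an M x M input tile (M = tile_h_in_) is
+// transformed with B, multiplied channel-wise against pre-transformed
+// weights, and mapped back with A; the output tile is M - N + 1 per side,
+// with N the kernel size. kronecker_product() below flattens each 2-D
+// transform into a single GEMM.
+
+/*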
+ */ + +#include "common.h" +#include "conv_winograd.h" + +namespace std +{ +template <typename Dtype> static inline Dtype max(Dtype a, Dtype b) +{ + if (a > b) + return a; + else + return b; +} +} + +namespace nnfw +{ +namespace srcn +{ + +void conv_winograd::param_init() +{ + if ((in_param_.kernel_w != in_param_.kernel_h) || (in_param_.stride_w != in_param_.stride_h) || + (in_param_.kernel_w != 3 && in_param_.kernel_w != 5) || (in_param_.stride_w != 1) || + (!winograd_weight_)) + { + error_ = 1; + return; + } + + int M, N; + const int w = in_mat_.w; + const int h = in_mat_.h; + const int outw = out_mat_.w; + const int outh = out_mat_.h; + const int pad_w = in_param_.pad_w; + const int pad_h = in_param_.pad_h; + + if (in_param_.kernel_w == 3) + { + M = winograd_para_3x3s1::M; + N = winograd_para_3x3s1::N; + } + else + { + M = winograd_para_5x5s1::M; + N = winograd_para_5x5s1::N; + } + + tile_h_in_ = tile_w_in_ = M; + tile_h_out_ = tile_h_in_ - N + 1; + tile_w_out_ = tile_w_in_ - N + 1; + ntiles_h_ = (std::max(h + pad_h - tile_h_in_ + 1, outh) + tile_h_out_ - 1) / tile_h_out_; + ntiles_w_ = (std::max(w + pad_w - tile_w_in_ + 1, outw) + tile_w_out_ - 1) / tile_w_out_; + + error_ = 0; +} + +conv_winograd::conv_winograd(const convMat_t &in_mat, convMat_t &out_mat, + const convParams_t &in_param, convType_t conv_type, + const float *winograd_weight, int num_threads, int inc_stride, + int outc_stride, int c_stride) + : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), conv_type_(conv_type), + winograd_weight_(winograd_weight), num_threads_(num_threads), inc_stride_(inc_stride), + outc_stride_(outc_stride), c_stride_(c_stride) + +{ + param_init(); +} + +conv_winograd::~conv_winograd() {} + +void conv_winograd::compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans, + const int m, const int n, const int k, const float *lhs_data, + const float *rhs_data, float *res_data) +{ + class sgemm_singlethread sgemm(major_type, ltrans, rtrans, m, n, k, lhs_data, rhs_data, res_data, + num_threads_); + + sgemm.run(); +} + +void conv_winograd::winograd_input_im2col(float *col_buff) +{ + const int w = in_mat_.w; + const int h = in_mat_.h; + const float *data = in_mat_.data; + const int channels = in_mat_.c; + const int pad_w = in_param_.pad_w; + const int pad_h = in_param_.pad_h; + + if (conv_type_ == row_major) + { +#ifdef NCNN + const int n = alignSize(inc_stride_, 16 / sizeof(float)); +#else // NCNN + const int n = inc_stride_; +#endif // NCNN + for (int c = 0; c < channels; ++c) + { + for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h) + { + for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w) + { + for (int y = 0; y < tile_h_in_; ++y) + { + for (int x = 0; x < tile_w_in_; ++x) + { + int in_y = tile_h * tile_h_out_ + y - pad_h; + int in_x = tile_w * tile_w_out_ + x - pad_w; + + if (in_y < 0 || in_x < 0 || in_y >= h || in_x >= w) + { + col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) * + tile_w_in_ + + x] = 0; + } + else + { + col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) * + tile_w_in_ + + x] = data[c * n + in_y * w + in_x]; + } + } + } + } + } + } + } + else if (conv_type_ == col_major) + { + for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h) + { + for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w) + { + for (int y = 0; y < tile_h_in_; ++y) + { + for (int x = 0; x < tile_w_in_; ++x) + { + for (int c = 0; c < channels; ++c) + { + int in_y = tile_h * tile_h_out_ + y - pad_h; + int in_x = tile_w * tile_w_out_ + x - pad_w; + + 
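+                  // Edge tiles may extend past the padded input; any
+                  // out-of-range tap is stored as zero so tiles stay dense.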
if (in_y < 0 || in_x < 0 || in_y >= h || in_x >= w) + { + col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) * + tile_w_in_ + + x] = 0; + } + else + { + col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_in_ + y) * + tile_w_in_ + + x] = data[c + (in_y * w + in_x) * channels]; + } + } + } + } + } + } + } +} + +void conv_winograd::winograd_output_col2im(const float *col_buff) +{ + int outh = out_mat_.h; + int outw = out_mat_.w; + float *data = out_mat_.data; + int channels = out_mat_.c; + + if (conv_type_ == row_major) + { +#ifdef NCNN + const int n = alignSize(outc_stride_, 16 / sizeof(float)); +#else // NCNN + const int n = outc_stride_; +#endif // NCNN + for (int c = 0; c < channels; ++c) + { + for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h) + { + for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w) + { + for (int y = 0; y < tile_h_out_; ++y) + { + for (int x = 0; x < tile_w_out_; ++x) + { + int out_y = tile_h * tile_h_out_ + y; + int out_x = tile_w * tile_w_out_ + x; + if (out_y < outh && out_x < outw) + { + data[c * n + out_y * outw + out_x] = + col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_out_ + y) * + tile_w_out_ + + x]; + } + } + } + } + } + } + } + else if (conv_type_ == col_major) + { + for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h) + { + for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w) + { + for (int y = 0; y < tile_h_out_; ++y) + { + for (int x = 0; x < tile_w_out_; ++x) + { + for (int c = 0; c < channels; ++c) + { + int out_y = tile_h * tile_h_out_ + y; + int out_x = tile_w * tile_w_out_ + x; + if (out_y < outh && out_x < outw) + { + data[c + (out_y * outw + out_x) * c_stride_] = + col_buff[(((c * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * tile_h_out_ + y) * + tile_w_out_ + + x]; + } + } + } + } + } + } + } +} + +void conv_winograd::compute_winograd() +{ + // const int w = in_mat_.w; + // const int h = in_mat_.h; + const int inch = in_mat_.c; + // const int outw = out_mat_.w; + // const int outh = out_mat_.h; + const int outch = out_mat_.c; + const int kernel_size = in_param_.kernel_w; + + int M, N; + const double *A; + const double *B; + + if (kernel_size == 3) + { + M = winograd_para_3x3s1::M; + N = winograd_para_3x3s1::N; + B = winograd_para_3x3s1::getB(); + A = winograd_para_3x3s1::getA(); + } + else + { + M = winograd_para_5x5s1::M; + N = winograd_para_5x5s1::N; + B = winograd_para_5x5s1::getB(); + A = winograd_para_5x5s1::getA(); + } + + /*Step 2: transfer image to winograd domain*/ + float *col_buff = + new float[std::max(outch, inch) * ntiles_h_ * ntiles_w_ * tile_h_in_ * tile_w_in_]; + + int temp1_n = inch * ntiles_h_ * ntiles_w_; + float *temp1_ = + new float[tile_h_in_ * tile_w_in_ * std::max(outch, inch) * ntiles_h_ * ntiles_w_]; + + float *winograd_b = new float[M * M * M * M]; + + if ((NULL == col_buff) || (NULL == temp1_) || (NULL == winograd_b)) + { + delete[] col_buff; + delete[] temp1_; + delete[] winograd_b; + return; + } + + winograd_input_im2col(col_buff); + + kronecker_product(winograd_b, B, B, M, M, M, M); + + compute_sgemm(rowMajor, trans, trans, tile_h_in_ * tile_w_in_, temp1_n, tile_h_in_ * tile_w_in_, + winograd_b, col_buff, temp1_); + + delete[] winograd_b; + + /*Step 3: convolution in winograd domain*/ + for (int j = 0; j < tile_h_in_ * tile_w_in_; ++j) + { + compute_sgemm(rowMajor, notrans, notrans, outch, ntiles_h_ * ntiles_w_, inch, + winograd_weight_ + j * c_stride_ * inch, + temp1_ + j * inch * ntiles_h_ * ntiles_w_, + col_buff + j * outch * ntiles_h_ * ntiles_w_); + } + + 
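+
+  /* Each of the tile_h_in_ * tile_w_in_ winograd-domain positions above is
+     an independent outch x (ntiles_h_ * ntiles_w_) GEMM over inch: the
+     channel-summed elementwise product in A^T [(G g G^T) o (B^T d B)] A. */
+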
/*Step 4: transfer back to time domain*/ + float *winograd_a = new float[M * (M - N + 1) * M * (M - N + 1)]; + if (NULL == winograd_a) + { + delete[] col_buff; + delete[] temp1_; + return; + } + kronecker_product(winograd_a, A, A, M, M - N + 1, M, M - N + 1); + compute_sgemm(rowMajor, trans, notrans, outch * ntiles_h_ * ntiles_w_, tile_h_out_ * tile_w_out_, + tile_h_in_ * tile_w_in_, col_buff, winograd_a, temp1_); + delete[] winograd_a; + delete[] col_buff; + + winograd_output_col2im(temp1_); + + delete[] temp1_; +} + +void conv_winograd::run() +{ + if (error_) + return; + + compute_winograd(); +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/conv_winograd.h b/compute/ncnn/src/srcn/conv_winograd.h new file mode 100644 index 000000000..76c2601f2 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_winograd.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_SRCN_CONV_WINOGRAD_H__ +#define __NNFW_SRCN_CONV_WINOGRAD_H__ + +#include "ncnn/srcn/conv_type.h" +#include "winograd.h" +#include "sgemm_singlethread.h" + +namespace nnfw +{ +namespace srcn +{ + +class conv_winograd +{ +public: + conv_winograd(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param, + convType_t conv_type, const float *winograd_weight, int num_threads, int inc_stride, + int outc_stride, int c_stride); + ~conv_winograd(); + + void run(); + +private: + void param_init(); + void compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans, const int m, + const int n, const int k, const float *lhs_data, const float *rhs_data, + float *res_data); + void winograd_input_im2col(float *col_buff); + void winograd_output_col2im(const float *col_buff); + void compute_winograd(); + + const convMat_t in_mat_; + convMat_t out_mat_; + const convParams_t in_param_; + convType_t conv_type_; + const float *winograd_weight_; + const int num_threads_; + + int tile_w_in_; + int tile_h_in_; + int tile_w_out_; + int tile_h_out_; + int ntiles_w_; + int ntiles_h_; + + int inc_stride_; + int outc_stride_; + int c_stride_; + + int error_; +}; + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_CONV_WINOGRAD_H__ diff --git a/compute/ncnn/src/srcn/conv_winograd_batch.cc b/compute/ncnn/src/srcn/conv_winograd_batch.cc new file mode 100644 index 000000000..cba45c648 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_winograd_batch.cc @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.h" +#include "conv_winograd_batch.h" + +namespace std +{ +template <typename Dtype> static inline Dtype max(Dtype a, Dtype b) +{ + if (a > b) + return a; + else + return b; +} +} + +namespace nnfw +{ +namespace srcn +{ + +void conv_winograd_batch::param_init() +{ + if ((in_param_.kernel_w != in_param_.kernel_h) || (in_param_.stride_w != in_param_.stride_h) || + (in_param_.kernel_w != 3 && in_param_.kernel_w != 5) || (in_param_.stride_w != 1) || + (!winograd_weight_)) + { + error_ = 1; + return; + } + + int M, N; + const int w = in_mat_.w; + const int h = in_mat_.h; + const int outw = out_mat_.w; + const int outh = out_mat_.h; + const int pad_w = in_param_.pad_w; + const int pad_h = in_param_.pad_h; + + if (in_param_.kernel_w == 3) + { + if (w == 4) + { + M = winograd_para_3x3s1_2::M; + N = winograd_para_3x3s1_2::N; + } + else + { + M = winograd_para_3x3s1::M; + N = winograd_para_3x3s1::N; + } + } + else + { + M = winograd_para_5x5s1::M; + N = winograd_para_5x5s1::N; + } + + tile_h_in_ = tile_w_in_ = M; + tile_h_out_ = tile_h_in_ - N + 1; + tile_w_out_ = tile_w_in_ - N + 1; + ntiles_h_ = (std::max(h + pad_h - tile_h_in_ + 1, outh) + tile_h_out_ - 1) / tile_h_out_; + ntiles_w_ = (std::max(w + pad_w - tile_w_in_ + 1, outw) + tile_w_out_ - 1) / tile_w_out_; + + error_ = 0; +} + +conv_winograd_batch::conv_winograd_batch(const convMat_t &in_mat, convMat_t &out_mat, + const convParams_t &in_param, convType_t conv_type, + const float *winograd_weight, int num_threads) + : in_mat_(in_mat), out_mat_(out_mat), in_param_(in_param), conv_type_(conv_type), + winograd_weight_(winograd_weight), num_threads_(num_threads) +{ + param_init(); +} + +conv_winograd_batch::~conv_winograd_batch() {} + +void conv_winograd_batch::compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans, + sgemmTrans_t rtrans, const int m, const int n, const int k, + const float *lhs_data, const float *rhs_data, + float *res_data) +{ + class sgemm_singlethread sgemm(major_type, ltrans, rtrans, m, n, k, lhs_data, rhs_data, res_data, + num_threads_); + + sgemm.run(); +} + +void conv_winograd_batch::winograd_input_im2col(float *col_buff) +{ + const int w = in_mat_.w; + const int h = in_mat_.h; + const float *data = in_mat_.data; + const int channels = in_mat_.c; + const int batch = in_mat_.n; + const int pad_w = in_param_.pad_w; + const int pad_h = in_param_.pad_h; + + // TODO: row_major + if (conv_type_ == col_major) + { + for (int n = 0; n < batch; n++) + { + for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h) + { + for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w) + { + for (int y = 0; y < tile_h_in_; ++y) + { + for (int x = 0; x < tile_w_in_; ++x) + { + for (int c = 0; c < channels; ++c) + { + int in_y = tile_h * tile_h_out_ + y - pad_h; + int in_x = tile_w * tile_w_out_ + x - pad_w; + + if (in_y < 0 || in_x < 0 || in_y >= h || in_x >= w) + { + col_buff[((((c * batch + n) * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * + tile_h_in_ + + y) * + tile_w_in_ + + x] = 0; + } + else + { + col_buff[((((c * batch + n) * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * + tile_h_in_ + + y) * + tile_w_in_ + + 
x] = data[((n * h + in_y) * w + in_x) * channels + c]; + } + } + } + } + } + } + } + } +} + +void conv_winograd_batch::winograd_output_col2im(const float *col_buff) +{ + int outh = out_mat_.h; + int outw = out_mat_.w; + float *data = out_mat_.data; + int channels = out_mat_.c; + int batch = out_mat_.n; + + // TODO: row_major + if (conv_type_ == col_major) + { + for (int n = 0; n < batch; n++) + { + for (int tile_h = 0; tile_h < ntiles_h_; ++tile_h) + { + for (int tile_w = 0; tile_w < ntiles_w_; ++tile_w) + { + for (int y = 0; y < tile_h_out_; ++y) + { + for (int x = 0; x < tile_w_out_; ++x) + { + for (int c = 0; c < channels; ++c) + { + int out_y = tile_h * tile_h_out_ + y; + int out_x = tile_w * tile_w_out_ + x; + if (out_y < outh && out_x < outw) + { + data[((n * outh + out_y) * outw + out_x) * channels + c] = + col_buff[((((c * batch + n) * ntiles_h_ + tile_h) * ntiles_w_ + tile_w) * + tile_h_out_ + + y) * + tile_w_out_ + + x]; + } + } + } + } + } + } + } + } +} + +void conv_winograd_batch::compute_winograd() +{ + const int w = in_mat_.w; + // const int h = in_mat_.h; + const int inch = in_mat_.c; + // const int outw = out_mat_.w; + // const int outh = out_mat_.h; + const int outch = out_mat_.c; + const int kernel_size = in_param_.kernel_w; + const int batch = in_mat_.n; + + int M, N; + const double *A; + const double *B; + + if (kernel_size == 3) + { + if (w == 4) + { + M = winograd_para_3x3s1_2::M; + N = winograd_para_3x3s1_2::N; + B = winograd_para_3x3s1_2::getB(); + A = winograd_para_3x3s1_2::getA(); + } + else + { + M = winograd_para_3x3s1::M; + N = winograd_para_3x3s1::N; + B = winograd_para_3x3s1::getB(); + A = winograd_para_3x3s1::getA(); + } + } + else + { + M = winograd_para_5x5s1::M; + N = winograd_para_5x5s1::N; + B = winograd_para_5x5s1::getB(); + A = winograd_para_5x5s1::getA(); + } + + /*Step 2: transfer image to winograd domain*/ + float *col_buff = + new float[std::max(outch, inch) * batch * ntiles_h_ * ntiles_w_ * tile_h_in_ * tile_w_in_]; + + int temp1_n = batch * inch * ntiles_h_ * ntiles_w_; + float *temp1_ = + new float[batch * tile_h_in_ * tile_w_in_ * std::max(outch, inch) * ntiles_h_ * ntiles_w_]; + + float *winograd_b = new float[M * M * M * M]; + + if ((NULL == col_buff) || (NULL == temp1_) || (NULL == winograd_b)) + { + delete[] col_buff; + delete[] temp1_; + delete[] winograd_b; + return; + } + + winograd_input_im2col(col_buff); + + kronecker_product(winograd_b, B, B, M, M, M, M); + + compute_sgemm(rowMajor, trans, trans, tile_h_in_ * tile_w_in_, temp1_n, tile_h_in_ * tile_w_in_, + winograd_b, col_buff, temp1_); + delete[] winograd_b; + + /*Step 3: convolution in winograd domain*/ + for (int j = 0; j < tile_h_in_ * tile_w_in_; ++j) + { + compute_sgemm(rowMajor, notrans, notrans, outch, batch * ntiles_h_ * ntiles_w_, inch, + winograd_weight_ + j * outch * inch, + temp1_ + j * batch * inch * ntiles_h_ * ntiles_w_, + col_buff + j * batch * outch * ntiles_h_ * ntiles_w_); + } + + /*Step 4: transfer back to time domain*/ + float *winograd_a = new float[M * (M - N + 1) * M * (M - N + 1)]; + if (NULL == winograd_a) + { + delete[] col_buff; + delete[] temp1_; + return; + } + + kronecker_product(winograd_a, A, A, M, M - N + 1, M, M - N + 1); + compute_sgemm(rowMajor, trans, notrans, batch * outch * ntiles_h_ * ntiles_w_, + tile_h_out_ * tile_w_out_, tile_h_in_ * tile_w_in_, col_buff, winograd_a, temp1_); + delete[] winograd_a; + delete[] col_buff; + + winograd_output_col2im(temp1_); + + delete[] temp1_; +} + +void conv_winograd_batch::run() +{ + if (error_) + return; 
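+
+  // error_ was set by param_init() if the kernel is not square 3x3/5x5 with
+  // stride 1, or if no pre-transformed weight was supplied.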
+ + compute_winograd(); +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/conv_winograd_batch.h b/compute/ncnn/src/srcn/conv_winograd_batch.h new file mode 100644 index 000000000..a022d9c52 --- /dev/null +++ b/compute/ncnn/src/srcn/conv_winograd_batch.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__ +#define __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__ + +#include "ncnn/srcn/conv_type.h" +#include "winograd.h" +#include "sgemm_singlethread.h" + +namespace nnfw +{ +namespace srcn +{ + +class conv_winograd_batch +{ +public: + conv_winograd_batch(const convMat_t &in_mat, convMat_t &out_mat, const convParams_t &in_param, + convType_t conv_type, const float *winograd_weight, int num_threads); + ~conv_winograd_batch(); + + void run(); + +private: + void param_init(); + void compute_sgemm(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans, const int m, + const int n, const int k, const float *lhs_data, const float *rhs_data, + float *res_data); + void winograd_input_im2col(float *col_buff); + void winograd_output_col2im(const float *col_buff); + void compute_winograd(); + + const convMat_t in_mat_; + convMat_t out_mat_; + const convParams_t in_param_; + convType_t conv_type_; + const float *winograd_weight_; + const int num_threads_; + + int tile_w_in_; + int tile_h_in_; + int tile_w_out_; + int tile_h_out_; + int ntiles_w_; + int ntiles_h_; + + int error_; +}; + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_CONV_WINOGRAD_BATCH_H__ diff --git a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc new file mode 100644 index 000000000..f3ccf13e5 --- /dev/null +++ b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.cc @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
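+ */
+
+// Note on conv_winograd_batch::param_init above: for 3x3 kernels on a
+// 4-wide input it switches to the smaller winograd_para_3x3s1_2 transform,
+// presumably because the default 3x3s1 input tile would overhang a 4-wide
+// image.
+
+/*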
+ */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "common.h" +#include "sgemm_kernel.h" +#include "sgemm_pack.h" +#include "deconv_sgemm_multithreads.h" + +namespace nnfw +{ +namespace srcn +{ + +void deconv_sgemm_multithreads::param_init() +{ +#if __aarch64__ + if (conv_type_ == row_major) + { + mr_ = 8; + nr_ = 12; + } + else if (conv_type_ == col_major) + { + + mr_ = 12; + nr_ = 8; + } +#else // __aarch64__ + if (conv_type_ == row_major) + { + mr_ = 6; + nr_ = 8; + } + else if (conv_type_ == col_major) + { + mr_ = 8; + nr_ = 6; + } +#endif // __aarch64__ + + int col = n_; + + if (m_ > n_) + { + shard_type_ = shardByRow; + col = m_; + } + else + { + shard_type_ = shardByCol; + } + + int th_base = divup(col, num_threads_); + + th_base = MIN(MAX(th_base, MIN_COL), MAX_COL); + + int k_div = (nr_ * sizeof_RhsScalar); + int k_sub = (mr_ * nr_ * sizeof_ResScalar); + + const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div * 2), MAX_K); + bk_ = MIN(k_cache, k_); + + if (shard_type_ == shardByCol) + { + int m_sub = (bk_ * nr_ * sizeof_RhsScalar); + int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + if (L3_CACHE_SIZE) + m_div = (sizeof_LhsScalar * bk_ * 2); + int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div); + bm_ = MIN(m_cache, m_); + + bn_ = MIN(th_base, n_); + if (L3_CACHE_SIZE) + { + int n_sub = (bk_ * bm_ * sizeof_RhsScalar); + int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + int n_cache = divup((L3_CACHE_SIZE - n_sub), n_div); + bn_ = MIN(n_cache, bn_); + } + } + else + { + int n_sub = (bk_ * mr_ * sizeof_LhsScalar); + int n_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + if (L3_CACHE_SIZE) + n_div = (sizeof_LhsScalar * bk_ * 2); + int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div); + bn_ = MIN(n_cache, n_); + + bm_ = MIN(th_base, m_); + if (L3_CACHE_SIZE) + { + int m_sub = (bk_ * bn_ * sizeof_RhsScalar); + int m_div = (sizeof_LhsScalar * bk_ * 2 * num_threads_); + int m_cache = divup((L3_CACHE_SIZE - m_sub), m_div); + bm_ = MIN(m_cache, bm_); + } + } + + nm_ = divup(m_, bm_); + nn_ = divup(n_, bn_); + nk_ = divup(k_, bk_); + + rm_ = m_ % bm_; + rn_ = n_ % bn_; + rk_ = k_ % bk_; +} + +deconv_sgemm_multithreads::deconv_sgemm_multithreads(const convMat_t &in_mat, + const convMat_t &weights_mat, + convMat_t &out_mat, + const convParams_t &in_param, int num_threads, + convType_t conv_type) + + : in_mat_(in_mat), weights_mat_(weights_mat), out_mat_(out_mat), in_param_(in_param), + conv_type_(conv_type), num_threads_(num_threads) +{ + m_ = in_param_.kernel_h * in_param_.kernel_w * out_mat_.c; +#ifdef NCNN + n_ = alignSize(in_mat_.h * in_mat_.w, 16 / sizeof(float)); +#else // NCNN + n_ = in_mat_.w * in_mat_.h; +#endif // NCNN + k_ = in_mat.c; + + param_init(); + + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + if (shard_type_ == shardByCol) + { + plhs_buffer_ = new float[lhs_stride * 1 * nm_]; + prhs_buffer_ = new float[rhs_stride * num_threads_]; + } + else + { + plhs_buffer_ = new float[lhs_stride * num_threads_]; + prhs_buffer_ = new float[rhs_stride * 1 * nn_]; + } + + pres_buffer_ = new float[bm_ * bn_ * num_threads_]; + + if (plhs_buffer_ == NULL || prhs_buffer_ == NULL || pres_buffer_ == NULL) + { + error_ = 1; + } + + if (in_param_.kernel_w != 1 || in_param_.kernel_h != 1 || in_param_.stride_w != 1 || + in_param_.stride_h != 1 || in_param_.padding != 0) + { + need_col2im_ = 1; + } + else + { + need_col2im_ = 0; + } + + omp_set_num_threads(num_threads_); + + error_ = 0; +} + 
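+// Operand roles are transposed here relative to the forward conv_sgemm
+// classes: m_ = kernel_h * kernel_w * output channels and k_ = input
+// channels, so the GEMM yields the column matrix that
+// _unpack_*_image_res() scatters back into out_mat_ (col2im) whenever
+// need_col2im_ is set.
+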
+deconv_sgemm_multithreads::~deconv_sgemm_multithreads() +{ + if (plhs_buffer_) + delete[] plhs_buffer_; + if (prhs_buffer_) + delete[] prhs_buffer_; + if (pres_buffer_) + delete[] pres_buffer_; +} + +void deconv_sgemm_multithreads::run() +{ + if (error_) + return; + + if (shard_type_ == shardByCol && conv_type_ == col_major) + { + compute_colmajor_colshard(); + } + else if (shard_type_ == shardByRow && conv_type_ == col_major) + { + compute_colmajor_rowshard(); + } + else if (shard_type_ == shardByCol && conv_type_ == row_major) + { + compute_rowmajor_colshard(); + } + else if (shard_type_ == shardByRow && conv_type_ == row_major) + { + compute_rowmajor_rowshard(); + } +} + +void deconv_sgemm_multithreads::compute_rowmajor_colshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], + &plhs_buffer_[i * lhs_stride]); + } + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + int thread_num = omp_get_thread_num(); + float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num]; + float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; + + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride], + prhs_ptr, pres_ptr, 0, bn, bk); + + if (need_col2im_) + _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); + } + } + } +} + +void deconv_sgemm_multithreads::compute_rowmajor_rowshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &in_mat_.data[l * bk_ * n_ + j * bn_], + &prhs_buffer_[j * rhs_stride]); + } + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + int thread_num = omp_get_thread_num(); + float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num]; + float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; + + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &weights_mat_.data[l * bk_ * m_ + i * bm_], + plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
bn_ : rn_; + _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, + &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bn, bk); + if (need_col2im_) + _unpack_rowmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); + } + } + } +} + +void deconv_sgemm_multithreads::compute_colmajor_colshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], + &plhs_buffer_[i * lhs_stride]); + } + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + int thread_num = omp_get_thread_num(); + float *prhs_ptr = &prhs_buffer_[rhs_stride * thread_num]; + float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; + + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, &plhs_buffer_[i * lhs_stride], + prhs_ptr, pres_ptr, 0, bm, bk); + + // Need to add lock? + if (need_col2im_) + _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); + } + } + } +} + +void deconv_sgemm_multithreads::compute_colmajor_rowshard() +{ + int lhs_stride = (bm_ + mr_ - 1) / mr_ * mr_ * bk_; + int rhs_stride = (bn_ + nr_ - 1) / nr_ * nr_ * bk_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + +#pragma omp parallel for + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &in_mat_.data[j * bn_ * k_ + l * bk_], + &prhs_buffer_[j * rhs_stride]); + } + +#pragma omp parallel for + for (int i = 0; i < nm_; i++) + { + int thread_num = omp_get_thread_num(); + float *plhs_ptr = &plhs_buffer_[lhs_stride * thread_num]; + float *pres_ptr = &pres_buffer_[bm_ * bn_ * thread_num]; + + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &weights_mat_.data[i * bm_ * k_ + l * bk_], + plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, + &prhs_buffer_[j * rhs_stride], pres_ptr, 0, bm, bk); + + if (need_col2im_) + _unpack_colmajor_image_res(bm, bn, i * bm_, j * bn_, const_cast<convMat_t *>(&in_mat_), + &out_mat_, const_cast<convParams_t *>(&in_param_), pres_ptr); + } + } + } +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h new file mode 100644 index 000000000..762f20380 --- /dev/null +++ b/compute/ncnn/src/srcn/deconv_sgemm_multithreads.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__
+#define __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__
+
+#include "ncnn/srcn/conv_type.h"
+#include "common.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+// Multithreaded SGEMM-based deconvolution: the weights are packed as the
+// LHS, the input as the RHS, and the m_ x n_ result is optionally
+// scattered back to the output image (col2im).
+class deconv_sgemm_multithreads
+{
+public:
+ deconv_sgemm_multithreads(const convMat_t &in_mat, const convMat_t &weights_mat,
+ convMat_t &out_mat, const convParams_t &in_param, int num_threads,
+ convType_t conv_type);
+ ~deconv_sgemm_multithreads();
+
+ void run();
+
+private:
+ void param_init();
+
+ void compute_rowmajor_colshard();
+ void compute_rowmajor_rowshard();
+ void compute_colmajor_colshard();
+ void compute_colmajor_rowshard();
+
+ const convMat_t in_mat_;
+ const convMat_t weights_mat_;
+ convMat_t out_mat_;
+ const convParams_t in_param_;
+ convType_t conv_type_;
+ const int num_threads_;
+
+ // GEMM problem size: the result is m_ x n_, accumulated over k_.
+ int m_;
+ int n_;
+ int k_;
+
+ // Cache-blocking (tile) sizes along m/n/k.
+ int bm_;
+ int bn_;
+ int bk_;
+
+ // Sizes of the last, possibly partial, block along m/n/k.
+ int rm_;
+ int rn_;
+ int rk_;
+
+ // Number of blocks along m/n/k (a partial tail block counts as one).
+ int nm_;
+ int nn_;
+ int nk_;
+
+ // Register-tile (micro-kernel) sizes along m and n.
+ int mr_;
+ int nr_;
+
+ int need_col2im_; // nonzero if the result must be scattered back to the image
+ shardType_t shard_type_; // parallelize across row blocks or column blocks
+
+ // Packed RHS (input) and LHS (weights) panels, plus bm_ x bn_ result
+ // tiles; depending on the shard type some of these are per-thread.
+ float *prhs_buffer_;
+ float *plhs_buffer_;
+ float *pres_buffer_;
+
+ int error_; // nonzero makes run() a no-op
+};
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_DECONV_SGEMM_MULTITHREADS_H__
diff --git a/compute/ncnn/src/srcn/depthwise_conv.cc b/compute/ncnn/src/srcn/depthwise_conv.cc
new file mode 100644
index 000000000..cd092d5ac
--- /dev/null
+++ b/compute/ncnn/src/srcn/depthwise_conv.cc
@@ -0,0 +1,2684 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <arm_neon.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "ncnn/srcn/conv_type.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+static void depthwise_conv3x3S1_nopad(const convMat_t &in_mat, convMat_t &out_mat,
+ const convMat_t &kernel, const convMat_t &bias)
+{
+#if !__aarch64__
+ int w = in_mat.w;
+ int h = in_mat.h;
+ int outw = out_mat.w;
+ int outh = out_mat.h;
+ int channels = in_mat.c;
+
+#pragma omp parallel for
+ for (int c = 0; c < channels; c++)
+ {
+ const float *filter = kernel.data + c * 9;
+#ifdef NCNN
+ float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float));
+ float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float));
+#else // NCNN
+ float *inbuf = in_mat.data + c * w * h;
+ float *outbuf = out_mat.data + c * outw * outh;
+#endif // NCNN
+ float bias0 = bias.data ? 
bias.data[c] : 0.0f; + + register float32x4_t weight012 asm("q4") = vld1q_f32(filter); + register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3); + register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6); + register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0); + + float *in_ptr0 = inbuf + 0 * w; + float *in_ptr1 = inbuf + 1 * w; + float *in_ptr2 = inbuf + 2 * w; + float *in_ptr3 = inbuf + 3 * w; + + float *out_ptr0 = outbuf + 0 * outw; + float *out_ptr1 = outbuf + 1 * outw; + + int i; + for (i = 0; i + 1 < outh; i += 2) + { + int nn = (outw >> 2) - 1; + int remain = outw & 0x03; + + if (nn > 0) + { + __asm __volatile("pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr0], %[in_ptr0], #16\n" + + "1:\n" + "add %[in_ptr0], %[in_ptr0], #16\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q2, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr1], %[in_ptr1], #16\n" + + "vand q15, %q[qbias0], %q[qbias0]\n" + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q2, %e[weight345][1]\n" + "vmul.f32 q12, q0, %e[weight012][0]\n" + "vmul.f32 q13, q2, %e[weight012][1]\n" + + "pld [%[in_ptr2], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vmla.f32 q15, q3, %f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr2], %[in_ptr2], #16\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q2, %e[weight678][1]\n" + "vmla.f32 q12, q0, %e[weight345][0]\n" + "vmla.f32 q13, q2, %e[weight345][1]\n" + + "pld [%[in_ptr3], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr3]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vmla.f32 q15, q3, %f[weight345][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr3], %[in_ptr3], #16\n" + + "vmla.f32 q12, q0, %e[weight678][0]\n" + "vmla.f32 q13, q2, %e[weight678][1]\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vmla.f32 q15, q3, %f[weight678][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + "vadd.f32 q15, q15, q12\n" + "vadd.f32 q15, q15, q13\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n" + + "bne 1b\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + + [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + + for (; remain > 0; remain--) + { + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + float32x4_t input3 = vld1q_f32(in_ptr3); + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + float32x4_t out1 = vmulq_f32(input1, weight012); + out1 = vmlaq_f32(out1, input2, weight345); + out1 = vmlaq_f32(out1, input3, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + out1 = vsetq_lane_f32(bias0, out1, 3); + + float32x2_t out00 = 
vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1)); + + float32x2_t out01 = vpadd_f32(out00, out11); + + *out_ptr0 = vget_lane_f32(out01, 0); + *out_ptr1 = vget_lane_f32(out01, 1); + + in_ptr0++; + in_ptr1++; + in_ptr2++; + in_ptr3++; + out_ptr0++; + out_ptr1++; + } + + in_ptr0 += w + 2; + in_ptr1 += w + 2; + in_ptr2 += w + 2; + in_ptr3 += w + 2; + + out_ptr0 += outw; + out_ptr1 += outw; + } + + for (; i < outh; i++) + { + int nn = outw >> 2; + int remain = outw & 0x03; + + if (nn > 0) + { + __asm __volatile("1:\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr0], %[in_ptr0], #16\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmla.f32 q14, q0, %e[weight012][0]\n" + "vmla.f32 q14, q2, %e[weight012][1]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr1], %[in_ptr1], #16\n" + + "vmla.f32 q14, q0, %e[weight345][0]\n" + "vmla.f32 q14, q2, %e[weight345][1]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr2], %[in_ptr2], #16\n" + + "vmla.f32 q14, q0, %e[weight678][0]\n" + "vmla.f32 q14, q2, %e[weight678][1]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + + "bne 1b\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + + for (; remain > 0; remain--) + { + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0++; + in_ptr1++; + in_ptr2++; + out_ptr0++; + } + + in_ptr0 += 2; + in_ptr1 += 2; + in_ptr2 += 2; + } + } +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)bias; +#endif // !__aarch64__ +} + +static void depthwise_conv3x3S1_padding(const convMat_t &in_mat, convMat_t &out_mat, + const convMat_t &kernel, const convMat_t &bias) +{ +#if !__aarch64__ + int w = in_mat.w; + int h = in_mat.h; + int outw = out_mat.w; + int outh = out_mat.h; + int channels = in_mat.c; + +#pragma omp parallel for + for (int c = 0; c < channels; c++) + { + const float *filter = kernel.data + c * 9; +#ifdef NCNN + float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float)); + float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float)); +#else // NCNN + float *inbuf = in_mat.data + c * w * h; + float *outbuf = out_mat.data + c * outw * outh; +#endif // NCNN + float bias0 = bias.data ? 
bias.data[c] : 0.0f; + + register float32x4_t weight012 asm("q4") = vld1q_f32(filter); + register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3); + register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6); + register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0); + + float *in_ptr0 = inbuf + 0 * w; + float *in_ptr1 = inbuf + 1 * w; + float *in_ptr2 = inbuf + 2 * w; + float *in_ptr3 = inbuf + 3 * w; + + float *out_ptr0 = outbuf + 0 * outw; + float *out_ptr1 = outbuf + 1 * outw; + + int i; + for (i = 0; i + 1 < outh; i += 2) + { + int nn = (outw >> 2) - 1; + int remain = (outw & 0x03) + 4; + if (i == 0) + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q8, #0\n" + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr0], %[in_ptr0], #12\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vand q15, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q2, %e[weight345][0]\n" + "vmul.f32 q11, q0, %e[weight345][1]\n" + "vmul.f32 q12, q2, %e[weight012][0]\n" + "vmul.f32 q13, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vmla.f32 q15, q3, %f[weight012][0]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr1], %[in_ptr1], #12\n" + + "vmla.f32 q10, q2, %e[weight678][0]\n" + "vmla.f32 q11, q0, %e[weight678][1]\n" + "vmla.f32 q12, q2, %e[weight345][0]\n" + "vmla.f32 q13, q0, %e[weight345][1]\n" + + "pld [%[in_ptr2], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vmla.f32 q15, q3, %f[weight345][0]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr2], %[in_ptr2], #12\n" + + "vmla.f32 q12, q2, %e[weight678][0]\n" + "vmla.f32 q13, q0, %e[weight678][1]\n" + "vmla.f32 q15, q3, %f[weight678][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + "vadd.f32 q15, q15, q12\n" + "vadd.f32 q15, q15, q13\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "1:\n" + "add %[in_ptr0], %[in_ptr0], #16\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vand q15, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight345][0]\n" + "vmul.f32 q11, q2, %e[weight345][1]\n" + "vmul.f32 q12, q0, %e[weight012][0]\n" + "vmul.f32 q13, q2, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vmla.f32 q15, q3, %f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr1], %[in_ptr1], #16\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q2, %e[weight678][1]\n" + "vmla.f32 q12, q0, %e[weight345][0]\n" + "vmla.f32 q13, q2, %e[weight345][1]\n" + + "pld [%[in_ptr2], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vmla.f32 q15, q3, %f[weight345][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr2], %[in_ptr2], #16\n" + + "vmla.f32 q12, q0, %e[weight678][0]\n" + "vmla.f32 q13, q2, %e[weight678][1]\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vmla.f32 q15, q3, %f[weight678][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + "vadd.f32 q15, q15, q12\n" + "vadd.f32 q15, 
q15, q13\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n" + "bne 1b\n" + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), + [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + + for (; remain > 0; remain--) + { + // TODO: when nn == 0, pad_left comes here. + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + input2 = vsetq_lane_f32(0.0f, input2, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight345); + out0 = vmlaq_f32(out0, input1, weight678); + + float32x4_t out1 = vmulq_f32(input0, weight012); + out1 = vmlaq_f32(out1, input1, weight345); + out1 = vmlaq_f32(out1, input2, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + out1 = vsetq_lane_f32(bias0, out1, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1)); + + float32x2_t out01 = vpadd_f32(out00, out11); + + *out_ptr0 = vget_lane_f32(out01, 0); + *out_ptr1 = vget_lane_f32(out01, 1); + + in_ptr0++; + in_ptr1++; + in_ptr2++; + out_ptr0++; + out_ptr1++; + } + + in_ptr0 += 1; + in_ptr1 += 1; + in_ptr2 += 1; + in_ptr3 += w; + } + else if (i == outh - 2) + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q8, #0\n" + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr0], %[in_ptr0], #12\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q2, %e[weight012][0]\n" + "vmul.f32 q11, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr1], %[in_ptr1], #12\n" + + "vand q15, %q[qbias0], %q[qbias0]\n" + "vmla.f32 q10, q2, %e[weight345][0]\n" + "vmla.f32 q11, q0, %e[weight345][1]\n" + "vmul.f32 q12, q2, %e[weight012][0]\n" + "vmul.f32 q13, q0, %e[weight012][1]\n" + + "pld [%[in_ptr2], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vmla.f32 q15, q3, %f[weight012][0]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr2], %[in_ptr2], #12\n" + + "vmla.f32 q10, q2, %e[weight678][0]\n" + "vmla.f32 q11, q0, %e[weight678][1]\n" + "vmla.f32 q12, q2, %e[weight345][0]\n" + "vmla.f32 q13, q0, %e[weight345][1]\n" + + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vmla.f32 q15, q3, %f[weight345][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + "vadd.f32 q15, q15, q12\n" + "vadd.f32 q15, q15, q13\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "1:\n" + "add %[in_ptr0], %[in_ptr0], #16\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q2, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, 
%f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr1], %[in_ptr1], #16\n" + + "vand q15, %q[qbias0], %q[qbias0]\n" + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q2, %e[weight345][1]\n" + "vmul.f32 q12, q0, %e[weight012][0]\n" + "vmul.f32 q13, q2, %e[weight012][1]\n" + + "pld [%[in_ptr2], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vmla.f32 q15, q3, %f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr2], %[in_ptr2], #16\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q2, %e[weight678][1]\n" + "vmla.f32 q12, q0, %e[weight345][0]\n" + "vmla.f32 q13, q2, %e[weight345][1]\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vmla.f32 q15, q3, %f[weight345][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + "vadd.f32 q15, q15, q12\n" + "vadd.f32 q15, q15, q13\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n" + "bne 1b\n" + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), + [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: when nn == 0, pad_left comes here. + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + input2 = vsetq_lane_f32(0.0f, input2, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + float32x4_t out1 = vmulq_f32(input1, weight012); + out1 = vmlaq_f32(out1, input2, weight345); + + out0 = vsetq_lane_f32(bias0, out0, 3); + out1 = vsetq_lane_f32(bias0, out1, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1)); + + float32x2_t out01 = vpadd_f32(out00, out11); + + *out_ptr0 = vget_lane_f32(out01, 0); + *out_ptr1 = vget_lane_f32(out01, 1); + + in_ptr0++; + in_ptr1++; + in_ptr2++; + out_ptr0++; + out_ptr1++; + } + } + else + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q8, #0\n" + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr0], %[in_ptr0], #12\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q2, %e[weight012][0]\n" + "vmul.f32 q11, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr1], %[in_ptr1], #12\n" + + "vand q15, %q[qbias0], %q[qbias0]\n" + "vmla.f32 q10, q2, %e[weight345][0]\n" + "vmla.f32 q11, q0, %e[weight345][1]\n" + "vmul.f32 q12, q2, %e[weight012][0]\n" + "vmul.f32 q13, q0, %e[weight012][1]\n" + + "pld [%[in_ptr2], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vmla.f32 q15, q3, %f[weight012][0]\n" + "vext.32 q2, q8, q0, #3\n" + 
"vext.32 q3, q0, q1, #1\n" + "add %[in_ptr2], %[in_ptr2], #12\n" + + "vmla.f32 q10, q2, %e[weight678][0]\n" + "vmla.f32 q11, q0, %e[weight678][1]\n" + "vmla.f32 q12, q2, %e[weight345][0]\n" + "vmla.f32 q13, q0, %e[weight345][1]\n" + + "pld [%[in_ptr3], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr3]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vmla.f32 q15, q3, %f[weight345][0]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr3], %[in_ptr3], #12\n" + + "vmla.f32 q15, q2, %e[weight678][0]\n" + "vmla.f32 q15, q0, %e[weight678][1]\n" + "vmla.f32 q15, q3, %f[weight678][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + "vadd.f32 q15, q15, q12\n" + "vadd.f32 q15, q15, q13\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "1:\n" + "add %[in_ptr0], %[in_ptr0], #16\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q2, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr1], %[in_ptr1], #16\n" + + "vand q15, %q[qbias0], %q[qbias0]\n" + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q2, %e[weight345][1]\n" + "vmul.f32 q12, q0, %e[weight012][0]\n" + "vmul.f32 q13, q2, %e[weight012][1]\n" + + "pld [%[in_ptr2], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vmla.f32 q15, q3, %f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr2], %[in_ptr2], #16\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q2, %e[weight678][1]\n" + "vmla.f32 q12, q0, %e[weight345][0]\n" + "vmla.f32 q13, q2, %e[weight345][1]\n" + + "pld [%[in_ptr3], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr3]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vmla.f32 q15, q3, %f[weight345][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr3], %[in_ptr3], #16\n" + + "vmla.f32 q15, q0, %e[weight678][0]\n" + "vmla.f32 q15, q2, %e[weight678][1]\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vmla.f32 q15, q3, %f[weight678][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q15, q15, q12\n" + "vadd.f32 q14, q14, q11\n" + "vadd.f32 q15, q15, q13\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "vst1.f32 {d30-d31}, [%[out_ptr1]]!\n" + "bne 1b\n" + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [in_ptr3] "+r"(in_ptr3), + + [out_ptr0] "+r"(out_ptr0), [out_ptr1] "+r"(out_ptr1), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: when nn == 0, pad_left comes here. 
+ float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + float32x4_t input3 = vld1q_f32(in_ptr3); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + input2 = vsetq_lane_f32(0.0f, input2, 2); + input3 = vsetq_lane_f32(0.0f, input3, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + float32x4_t out1 = vmulq_f32(input1, weight012); + out1 = vmlaq_f32(out1, input2, weight345); + out1 = vmlaq_f32(out1, input3, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + out1 = vsetq_lane_f32(bias0, out1, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + float32x2_t out11 = vadd_f32(vget_low_f32(out1), vget_high_f32(out1)); + + float32x2_t out01 = vpadd_f32(out00, out11); + + *out_ptr0 = vget_lane_f32(out01, 0); + *out_ptr1 = vget_lane_f32(out01, 1); + + in_ptr0++; + in_ptr1++; + in_ptr2++; + in_ptr3++; + out_ptr0++; + out_ptr1++; + } + in_ptr0 += w + 1; + in_ptr1 += w + 1; + in_ptr2 += w + 1; + in_ptr3 += w + 1; + } + + out_ptr0 += outw; + out_ptr1 += outw; + } + + for (; i < outh; i++) + { + // TODO:if i == 0, pad_top comes here. + int nn = (outw >> 2) - 1; + int remain = (outw & 0x03) + 4; + + if (nn > 0) + { + __asm __volatile("vmov.i32 q8, #0\n" + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr0], %[in_ptr0], #12\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q2, %e[weight012][0]\n" + "vmul.f32 q11, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q2, q8, q0, #3\n" + "vext.32 q3, q0, q1, #1\n" + "add %[in_ptr1], %[in_ptr1], #12\n" + + "vmla.f32 q10, q2, %e[weight345][0]\n" + "vmla.f32 q11, q0, %e[weight345][1]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "1:\n" + "add %[in_ptr0], %[in_ptr0], #16\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q2, %e[weight012][1]\n" + + "pld [%[in_ptr1], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + "add %[in_ptr1], %[in_ptr1], #16\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q2, %e[weight345][1]\n" + + "pld [%[in_ptr0], #192]\n" + "vld1.f32 {d0-d2}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q2, q0, q1, #1\n" + "vext.32 q3, q0, q1, #2\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: when nn == 0, pad_left comes here. 
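+ // Tail row (the bottom row when outh is odd): filter row 2 (weight678)
+ // lies entirely in the bottom zero-padding, so only the last two input
+ // rows contribute, paired with filter rows 0 and 1.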
+ float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0++; + in_ptr1++; + out_ptr0++; + out_ptr1++; + } + } + } +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)bias; +#endif // __aarch64__ +} + +static void depthwise_conv3x3S2_nopad(const convMat_t &in_mat, convMat_t &out_mat, + const convMat_t &kernel, const convMat_t &bias) +{ +#if !__aarch64__ + int w = in_mat.w; + int h = in_mat.h; + int outw = out_mat.w; + int outh = out_mat.h; + int channels = in_mat.c; + + const int tailstep = w - 2 * outw + w; + +#pragma omp parallel for + for (int c = 0; c < channels; c++) + { + const float *filter = kernel.data + c * 9; +#ifdef NCNN + float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float)); + float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float)); +#else // NCNN + float *inbuf = in_mat.data + c * w * h; + float *outbuf = out_mat.data + c * outw * outh; +#endif // NCNN + float bias0 = bias.data ? bias.data[c] : 0.0f; + + register float32x4_t weight012 asm("q4") = vld1q_f32(filter); + register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3); + register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6); + register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0); + + float *in_ptr0 = inbuf + 0 * w; + float *in_ptr1 = inbuf + 1 * w; + float *in_ptr2 = inbuf + 2 * w; + + float *out_ptr0 = outbuf + 0 * outw; + + int i; + for (i = 0; i < outh; i++) + { + int nn = outw >> 2; + int remain = outw & 0x03; + + if (nn > 0) + { + __asm __volatile("pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr2], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q1, %e[weight678][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + + for (; remain > 0; remain--) + { + float32x4_t input0 = 
vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + in_ptr2 += 2; + out_ptr0++; + } + + in_ptr0 += tailstep; + in_ptr1 += tailstep; + in_ptr2 += tailstep; + } + } + +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)bias; +#endif // __aarch64__ +} + +static void depthwise_conv3x3S2_padding00(const convMat_t &in_mat, convMat_t &out_mat, + const convMat_t &kernel, const convMat_t &bias) +{ +#if !__aarch64__ + int w = in_mat.w; + int h = in_mat.h; + int outw = out_mat.w; + int outh = out_mat.h; + int channels = in_mat.c; + +#pragma omp parallel for + for (int c = 0; c < channels; c++) + { + const float *filter = kernel.data + c * 9; +#ifdef NCNN + float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float)); + float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float)); +#else // NCNN + float *inbuf = in_mat.data + c * w * h; + float *outbuf = out_mat.data + c * outw * outh; +#endif // NCNN + float bias0 = bias.data ? bias.data[c] : 0.0f; + + register float32x4_t weight012 asm("q4") = vld1q_f32(filter); + register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3); + register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6); + register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0); + + float *in_ptr0 = inbuf + 0 * w; + float *in_ptr1 = inbuf + 1 * w; + float *in_ptr2 = inbuf + 2 * w; + + float *out_ptr0 = outbuf + 0 * outw; + + int i; + for (i = 0; i < outh; i++) + { + int nn = (outw >> 2) - 1; + int remain = (outw & 0x03) + 4; + + if (i == outh - 1) + { + if (nn > 0) + { + __asm __volatile("pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); 
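+ // Filter row 2 (weight678) would read below the last input row, so the
+ // bottom output row accumulates filter rows 0 and 1 only; lane 3 is now
+ // replaced by the bias before the horizontal reduction.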
+ + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + out_ptr0++; + } + } + else + { + if (nn > 0) + { + __asm __volatile("pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr2], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q1, %e[weight678][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + input2 = vsetq_lane_f32(0.0f, input2, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + in_ptr2 += 2; + out_ptr0++; + } + + in_ptr0 += w; + in_ptr1 += w; + in_ptr2 += w; + } + } + } +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)bias; +#endif // !__aarch64__ +} + +static void depthwise_conv3x3S2_padding01(const convMat_t &in_mat, convMat_t &out_mat, + const convMat_t &kernel, const convMat_t &bias) +{ +#if !__aarch64__ + int w = in_mat.w; + int h = in_mat.h; + int outw = out_mat.w; + int outh = out_mat.h; + int channels = in_mat.c; + +#pragma omp parallel for + for (int c = 0; c < channels; c++) + { + const float *filter = kernel.data + c * 9; +#ifdef NCNN + float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float)); + float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float)); +#else // NCNN + float *inbuf = in_mat.data + c * w * h; + float *outbuf = out_mat.data + c * outw * outh; +#endif // NCNN + float bias0 = bias.data ? 
bias.data[c] : 0.0f; + + register float32x4_t weight012 asm("q4") = vld1q_f32(filter); + register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3); + register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6); + register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0); + + float *in_ptr0 = inbuf + 0 * w; + float *in_ptr1 = inbuf + 1 * w; + float *in_ptr2 = inbuf + 2 * w; + + float *out_ptr0 = outbuf + 0 * outw; + + int i; + for (i = 0; i < outh; i++) + { + int nn = (outw >> 2) - 1; + int remain = (outw & 0x03) + 4; + + if (i == outh - 1) + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q2, #0\n" + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr0], %[in_ptr0], #28\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q3, %e[weight012][0]\n" + "vmul.f32 q11, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]\n" + "vmla.f32 q14, q1, %f[weight012][0]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr1], %[in_ptr1], #28\n" + + "vmla.f32 q10, q3, %e[weight345][0]\n" + "vmla.f32 q11, q0, %e[weight345][1]\n" + "vmla.f32 q14, q1, %f[weight345][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: if nn == 0, pad_left comes here. 
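+ // Stride-2 scalar tail for the bottom row. The asm prologue above
+ // consumed the left-padded column when nn > 0; for nn == 0 it is still
+ // unhandled (see the TODO). Each pixel reads a 3-wide window while the
+ // input pointers advance by 2; remain == 1 is the right-padded column,
+ // where lane 2 is zeroed below.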
+ float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + out_ptr0++; + } + } + else + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q2, #0\n" + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr0], %[in_ptr0], #28\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q3, %e[weight012][0]\n" + "vmul.f32 q11, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]\n" + "vmla.f32 q14, q1, %f[weight012][0]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr1], %[in_ptr1], #28\n" + + "vmla.f32 q10, q3, %e[weight345][0]\n" + "vmla.f32 q11, q0, %e[weight345][1]\n" + + "pld [%[in_ptr2], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr2]]\n" + "vmla.f32 q14, q1, %f[weight345][0]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr2], %[in_ptr2], #28\n" + + "vmla.f32 q10, q3, %e[weight678][0]\n" + "vmla.f32 q11, q0, %e[weight678][1]\n" + "vmla.f32 q14, q1, %f[weight678][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr2], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q1, %e[weight678][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: if nn == 0, pad_left comes here. 
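+ // Interior rows: all three filter rows are live, so three input rows
+ // are loaded per pixel and reduced with the same bias-in-lane-3 trick.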
+ float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + input2 = vsetq_lane_f32(0.0f, input2, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + in_ptr2 += 2; + out_ptr0++; + } + + in_ptr0 += w; + in_ptr1 += w; + in_ptr2 += w; + } + } + } + +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)bias; +#endif // __aarch64__ +} + +static void depthwise_conv3x3S2_padding10(const convMat_t &in_mat, convMat_t &out_mat, + const convMat_t &kernel, const convMat_t &bias) +{ +#if !__aarch64__ + int w = in_mat.w; + int h = in_mat.h; + int outw = out_mat.w; + int outh = out_mat.h; + int channels = in_mat.c; + +#pragma omp parallel for + for (int c = 0; c < channels; c++) + { + const float *filter = kernel.data + c * 9; +#ifdef NCNN + float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float)); + float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float)); +#else // NCNN + float *inbuf = in_mat.data + c * w * h; + float *outbuf = out_mat.data + c * outw * outh; +#endif // NCNN + float bias0 = bias.data ? bias.data[c] : 0.0f; + + register float32x4_t weight012 asm("q4") = vld1q_f32(filter); + register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3); + register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6); + register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0); + + float *in_ptr0 = inbuf + 0 * w; + float *in_ptr1 = inbuf + 1 * w; + float *in_ptr2 = inbuf + 2 * w; + + float *out_ptr0 = outbuf + 0 * outw; + + int i; + for (i = 0; i < outh; i++) + { + int nn = (outw >> 2) - 1; + int remain = (outw & 0x03) + 4; + + // TODO: i == 0 && i == outh -1 + if (i == 0) + { + if (nn > 0) + { + __asm __volatile("pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight345][0]\n" + "vmul.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q1, %e[weight678][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + + if (remain == 1) 
+ { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight345); + out0 = vmlaq_f32(out0, input1, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + out_ptr0++; + } + + in_ptr2 += w; + } + else if (i == outh - 1) + { + if (nn > 0) + { + __asm __volatile("pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + out_ptr0++; + } + } + else + { + if (nn > 0) + { + __asm __volatile("pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr2], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q1, %e[weight678][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + : [in_ptr0] "+r"(in_ptr0), 
[in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + input2 = vsetq_lane_f32(0.0f, input2, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + in_ptr2 += 2; + out_ptr0++; + } + + in_ptr0 += w; + in_ptr1 += w; + in_ptr2 += w; + } + } + } + +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)bias; +#endif // __aarch64__ +} + +static void depthwise_conv3x3S2_padding11(const convMat_t &in_mat, convMat_t &out_mat, + const convMat_t &kernel, const convMat_t &bias) +{ +#if !__aarch64__ + int w = in_mat.w; + int h = in_mat.h; + int outw = out_mat.w; + int outh = out_mat.h; + int channels = in_mat.c; + +#pragma omp parallel for + for (int c = 0; c < channels; c++) + { + const float *filter = kernel.data + c * 9; +#ifdef NCNN + float *inbuf = in_mat.data + c * alignSize(w * h, 16 / sizeof(float)); + float *outbuf = out_mat.data + c * alignSize(outw * outh, 16 / sizeof(float)); +#else // NCNN + float *inbuf = in_mat.data + c * w * h; + float *outbuf = out_mat.data + c * outw * outh; +#endif // NCNN + float bias0 = bias.data ? 
bias.data[c] : 0.0f; + + register float32x4_t weight012 asm("q4") = vld1q_f32(filter); + register float32x4_t weight345 asm("q5") = vld1q_f32(filter + 3); + register float32x4_t weight678 asm("q6") = vld1q_f32(filter + 6); + register float32x4_t qbias0 asm("q7") = vdupq_n_f32(bias0); + + float *in_ptr0 = inbuf + 0 * w; + float *in_ptr1 = inbuf + 1 * w; + float *in_ptr2 = inbuf + 2 * w; + + float *out_ptr0 = outbuf + 0 * outw; + + int i; + for (i = 0; i < outh; i++) + { + int nn = (outw >> 2) - 1; + int remain = (outw & 0x03) + 4; + + // TODO: i == 0 && i == outh - 1 + if (i == 0) + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q2, #0\n" + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr0], %[in_ptr0], #28\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q3, %e[weight345][0]\n" + "vmul.f32 q11, q0, %e[weight345][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]\n" + "vmla.f32 q14, q1, %f[weight345][0]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr1], %[in_ptr1], #28\n" + + "vmla.f32 q10, q3, %e[weight678][0]\n" + "vmla.f32 q11, q0, %e[weight678][1]\n" + "vmla.f32 q14, q1, %f[weight678][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight345][0]\n" + "vmul.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q1, %e[weight678][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: if nn == 0, pad_left comes here. 
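+ // Top output row: filter row 0 (weight012) falls into the top padding,
+ // so the first two input rows pair with filter rows 1 and 2
+ // (weight345 and weight678) in this scalar tail.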
+ float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight345); + out0 = vmlaq_f32(out0, input1, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + out_ptr0++; + } + + in_ptr2 += w; + } + else if (i == outh - 1) + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q2, #0\n" + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr0], %[in_ptr0], #28\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q3, %e[weight012][0]\n" + "vmul.f32 q11, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]\n" + "vmla.f32 q14, q1, %f[weight012][0]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr1], %[in_ptr1], #28\n" + + "vmla.f32 q10, q3, %e[weight345][0]\n" + "vmla.f32 q11, q0, %e[weight345][1]\n" + "vmla.f32 q14, q1, %f[weight345][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: if nn == 0, pad_left comes here. 
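+ // Bottom output row: filter row 2 (weight678) falls into the bottom
+ // padding, so only filter rows 0 and 1 are accumulated here.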
+ float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + out_ptr0++; + } + } + else + { + if (nn > 0) + { + __asm __volatile("vmov.i32 q2, #0\n" + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr0], %[in_ptr0], #28\n" + + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q3, %e[weight012][0]\n" + "vmul.f32 q11, q0, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]\n" + "vmla.f32 q14, q1, %f[weight012][0]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr1], %[in_ptr1], #28\n" + + "vmla.f32 q10, q3, %e[weight345][0]\n" + "vmla.f32 q11, q0, %e[weight345][1]\n" + + "pld [%[in_ptr2], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr2]]\n" + "vmla.f32 q14, q1, %f[weight345][0]\n" + "vext.32 q3, q2, q0, #3\n" + "add %[in_ptr2], %[in_ptr2], #28\n" + + "vmla.f32 q10, q3, %e[weight678][0]\n" + "vmla.f32 q11, q0, %e[weight678][1]\n" + "vmla.f32 q14, q1, %f[weight678][0]\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "beq 2f\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vext.32 q3, q0, q2, #1\n" + + "1:\n" + "vand q14, %q[qbias0], %q[qbias0]\n" + "vmul.f32 q10, q0, %e[weight012][0]\n" + "vmul.f32 q11, q1, %e[weight012][1]\n" + + "pld [%[in_ptr1], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr1]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr1]]\n" + "vmla.f32 q14, q3, %f[weight012][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight345][0]\n" + "vmla.f32 q11, q1, %e[weight345][1]\n" + + "pld [%[in_ptr2], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr2]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr2]]\n" + "vmla.f32 q14, q3, %f[weight345][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vmla.f32 q10, q0, %e[weight678][0]\n" + "vmla.f32 q11, q1, %e[weight678][1]\n" + + "pld [%[in_ptr0], #256]\n" + "vld2.f32 {d0-d3}, [%[in_ptr0]]!\n" + "vld1.f32 {d4[0]}, [%[in_ptr0]]\n" + "vmla.f32 q14, q3, %f[weight678][0]\n" + "vext.32 q3, q0, q2, #1\n" + + "vadd.f32 q14, q14, q10\n" + "vadd.f32 q14, q14, q11\n" + + "subs %[nn], %[nn], #1\n" + "vst1.f32 {d28-d29}, [%[out_ptr0]]!\n" + "bne 1b\n" + "sub %[in_ptr0], %[in_ptr0], #32\n" + "2:\n" + : [in_ptr0] "+r"(in_ptr0), [in_ptr1] "+r"(in_ptr1), + [in_ptr2] "+r"(in_ptr2), [out_ptr0] "+r"(out_ptr0), [nn] "+r"(nn) + : [weight012] "w"(weight012), [weight345] "w"(weight345), + [weight678] "w"(weight678), [qbias0] "w"(qbias0) + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", + "q15", "cc", "memory"); + } + for (; remain > 0; remain--) + { + // TODO: if nn == 0, pad_left comes here. 
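+ // Scalar tail for interior rows: all three kernel rows contribute + // (in_ptr0/1/2 against weight012/345/678). The per-lane products of the + // three rows are accumulated, lane 3 is replaced by bias0, and a single + // vadd_f32/vpadd_f32 pair produces the final scalar for this output column.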
+ float32x4_t input0 = vld1q_f32(in_ptr0); + float32x4_t input1 = vld1q_f32(in_ptr1); + float32x4_t input2 = vld1q_f32(in_ptr2); + + if (remain == 1) + { + input0 = vsetq_lane_f32(0.0f, input0, 2); + input1 = vsetq_lane_f32(0.0f, input1, 2); + input2 = vsetq_lane_f32(0.0f, input2, 2); + } + + float32x4_t out0 = vmulq_f32(input0, weight012); + out0 = vmlaq_f32(out0, input1, weight345); + out0 = vmlaq_f32(out0, input2, weight678); + + out0 = vsetq_lane_f32(bias0, out0, 3); + + float32x2_t out00 = vadd_f32(vget_low_f32(out0), vget_high_f32(out0)); + + float32x2_t out01 = vpadd_f32(out00, out00); + + *out_ptr0 = vget_lane_f32(out01, 0); + + in_ptr0 += 2; + in_ptr1 += 2; + in_ptr2 += 2; + out_ptr0++; + } + + in_ptr0 += w; + in_ptr1 += w; + in_ptr2 += w; + } + } + } +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)bias; +#endif // __aarch64__ +} + +static void depthwise_conv_colmajor(const convMat_t &in_mat, convMat_t &out_mat, + const convMat_t &kernel, const convParams_t &in_param) +{ +#if __aarch64__ + const int w = in_mat.w; + const int h = in_mat.h; + const int outw = out_mat.w; + const int outh = out_mat.h; + const int channels = out_mat.c; + const int stridew = in_param.stride_w; + const int strideh = in_param.stride_h; + const int padding = in_param.padding; + const int padw = in_param.pad_w; + const int padh = in_param.pad_h; + +#pragma omp parallel for + for (int oh = 0; oh < outh; oh++) + { + const float *input_data0 = in_mat.data + (oh * strideh - padh) * w * channels; + + memset(out_mat.data + oh * outw * channels, 0x00, outw * channels * sizeof(float)); + + for (int kh = 0; kh < in_param.kernel_h; kh++) + { + for (int kw = 0; kw < in_param.kernel_w; kw++) + { + const float *kernel_data = kernel.data + (kh * in_param.kernel_w + kw) * channels; + const float *input_data1 = input_data0 + (kh * w + kw) * channels; + + if (padding && ((oh * strideh + kh < padh) || (oh * strideh + kh >= padh + h))) + { + continue; + } + + int ow = 0; + for (; ow + 3 < outw; /*ow += 4*/) + { + if (((ow + 3) * stridew + kw < padw) || (ow * stridew + kw >= padw + w)) + { + ow += 4; + continue; + } + else if ((ow + 3) * stridew + kw >= padw + w) + { + break; + } + else if (ow * stridew + kw < padw) + { + int delta = (padw - kw) / stridew - ow; + delta += (padw - kw) % stridew ? 
1 : 0; + ow += delta; + continue; + } + + int nn = channels >> 2; + int remain = channels & 0x03; + + const float *input_r0 = input_data1 + (ow * stridew - padw) * channels; + + const float *input_r1 = input_r0 + stridew * channels; + const float *input_r2 = input_r1 + stridew * channels; + const float *input_r3 = input_r2 + stridew * channels; + const float *weights_data = kernel_data; + float *output_r0 = out_mat.data + (oh * outw + ow) * channels; + float *output_r1 = output_r0 + channels; + float *output_r2 = output_r1 + channels; + float *output_r3 = output_r2 + channels; + + if (nn > 0) + { + int _n = (nn + 1) >> 1; + int oddn = nn & 1; + + asm volatile("subs %[_n], %[_n], #1\n" + "ld1 {v4.4s}, [%[weights_data]], #16\n" + "ld1 {v5.4s}, [%[input_r0]], #16\n" + "ld1 {v6.4s}, [%[input_r1]], #16\n" + "ld1 {v7.4s}, [%[input_r2]], #16\n" + "ld1 {v8.4s}, [%[input_r3]], #16\n" + "beq 1f\n" + + "0:\n" + "ld1 {v24.4s, v25.4s}, [%[output_r0]]\n" + "ld1 {v26.4s, v27.4s}, [%[output_r1]]\n" + "ld1 {v28.4s, v29.4s}, [%[output_r2]]\n" + "ld1 {v30.4s, v31.4s}, [%[output_r3]]\n" + + "ld1 {v9.4s}, [%[weights_data]], #16\n" + "ld1 {v10.4s}, [%[input_r0]], #16\n" + "ld1 {v11.4s}, [%[input_r1]], #16\n" + "ld1 {v12.4s}, [%[input_r2]], #16\n" + "ld1 {v13.4s}, [%[input_r3]], #16\n" + + "fmla v24.4s, v4.4s, v5.4s\n" + "fmla v26.4s, v4.4s, v6.4s\n" + + "fmla v28.4s, v4.4s, v7.4s\n" + "fmla v30.4s, v4.4s, v8.4s\n" + + "ld1 {v4.4s}, [%[weights_data]], #16\n" + "ld1 {v5.4s}, [%[input_r0]], #16\n" + "ld1 {v6.4s}, [%[input_r1]], #16\n" + "ld1 {v7.4s}, [%[input_r2]], #16\n" + "ld1 {v8.4s}, [%[input_r3]], #16\n" + + "fmla v25.4s, v9.4s, v10.4s\n" + "fmla v27.4s, v9.4s, v11.4s\n" + + "fmla v29.4s, v9.4s, v12.4s\n" + "fmla v31.4s, v9.4s, v13.4s\n" + + "st1 {v24.4s, v25.4s}, [%[output_r0]], #32\n" + "st1 {v26.4s, v27.4s}, [%[output_r1]], #32\n" + "st1 {v28.4s, v29.4s}, [%[output_r2]], #32\n" + "st1 {v30.4s, v31.4s}, [%[output_r3]], #32\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v24.4s}, [%[output_r0]]\n" + "ld1 {v26.4s}, [%[output_r1]]\n" + "ld1 {v28.4s}, [%[output_r2]]\n" + "ld1 {v30.4s}, [%[output_r3]]\n" + "cmp %[oddn], #1\n" + + "fmla v24.4s, v4.4s, v5.4s\n" + "fmla v26.4s, v4.4s, v6.4s\n" + + "fmla v28.4s, v4.4s, v7.4s\n" + "fmla v30.4s, v4.4s, v8.4s\n" + + "st1 {v24.4s}, [%[output_r0]], #16\n" + "st1 {v26.4s}, [%[output_r1]], #16\n" + "st1 {v28.4s}, [%[output_r2]], #16\n" + "st1 {v30.4s}, [%[output_r3]], #16\n" + + "beq 2f\n" + "ld1 {v25.4s}, [%[output_r0]]\n" + "ld1 {v27.4s}, [%[output_r1]]\n" + "ld1 {v29.4s}, [%[output_r2]]\n" + "ld1 {v31.4s}, [%[output_r3]]\n" + + "ld1 {v9.4s}, [%[weights_data]], #16\n" + "ld1 {v10.4s}, [%[input_r0]], #16\n" + "ld1 {v11.4s}, [%[input_r1]], #16\n" + "ld1 {v12.4s}, [%[input_r2]], #16\n" + "ld1 {v13.4s}, [%[input_r3]], #16\n" + + "fmla v25.4s, v9.4s, v10.4s\n" + "fmla v27.4s, v9.4s, v11.4s\n" + + "fmla v29.4s, v9.4s, v12.4s\n" + "fmla v31.4s, v9.4s, v13.4s\n" + + "st1 {v25.4s}, [%[output_r0]], #16\n" + "st1 {v27.4s}, [%[output_r1]], #16\n" + "st1 {v29.4s}, [%[output_r2]], #16\n" + "st1 {v31.4s}, [%[output_r3]], #16\n" + "2:\n" + : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0), + [input_r1] "+r"(input_r1), [input_r2] "+r"(input_r2), + [input_r3] "+r"(input_r3), [output_r0] "+r"(output_r0), + [output_r1] "+r"(output_r1), [output_r2] "+r"(output_r2), + [output_r3] "+r"(output_r3), [_n] "+r"(_n) + : [oddn] "r"(oddn) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v24", "v25", "v26", "v27", "v28", "v29", "v30", 
"v31"); + } + if (remain >= 2) + { + asm volatile( + "ld1 {v24.2s}, [%[output_r0]]\n" + "ld1 {v26.2s}, [%[output_r1]]\n" + "ld1 {v28.2s}, [%[output_r2]]\n" + "ld1 {v30.2s}, [%[output_r3]]\n" + "ld1 {v4.2s}, [%[weights_data]], #8\n" + "ld1 {v5.2s}, [%[input_r0]], #8\n" + + "ld1 {v6.2s}, [%[input_r1]], #8\n" + "ld1 {v7.2s}, [%[input_r2]], #8\n" + "ld1 {v8.2s}, [%[input_r3]], #8\n" + + "fmla v24.2s, v4.2s, v5.2s\n" + "fmla v26.2s, v4.2s, v6.2s\n" + + "fmla v28.2s, v4.2s, v7.2s\n" + "fmla v30.2s, v4.2s, v8.2s\n" + + "st1 {v24.2s}, [%[output_r0]], #8\n" + "st1 {v26.2s}, [%[output_r1]], #8\n" + "st1 {v28.2s}, [%[output_r2]], #8\n" + "st1 {v30.2s}, [%[output_r3]], #8\n" + : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0), + [input_r1] "+r"(input_r1), [input_r2] "+r"(input_r2), [input_r3] "+r"(input_r3), + [output_r0] "+r"(output_r0), [output_r1] "+r"(output_r1), + [output_r2] "+r"(output_r2), [output_r3] "+r"(output_r3) + : + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v24", "v26", "v28", "v30"); + remain -= 2; + } + + if (remain > 0) + { + *output_r0++ += (*weights_data) * (*input_r0++); + *output_r1++ += (*weights_data++) * (*input_r1++); + *output_r2++ += (*weights_data) * (*input_r2++); + *output_r3++ += (*weights_data++) * (*input_r3++); + } + ow += 4; + } + + for (; ow + 1 < outw; /*ow += 2*/) + { + if (padding) + { + if (((ow + 1) * stridew + kw < padw) || (ow * stridew + kw >= padw + w)) + { + ow += 2; + continue; + } + else if ((ow + 1) * stridew + kw >= padw + w) + { + break; + } + else if (ow * stridew + kw < padw) + { + ow++; + continue; + } + } + + int nn = channels >> 2; + int remain = channels & 0x03; + + const float *input_r0 = input_data1 + (ow * stridew - padw) * channels; + + const float *input_r1 = input_r0 + stridew * channels; + const float *weights_data = kernel_data; + float *output_r0 = out_mat.data + (oh * outw + ow) * channels; + float *output_r1 = output_r0 + channels; + + if (nn > 0) + { + int _n = (nn + 1) >> 1; + int oddn = nn & 1; + + asm volatile("subs %[_n], %[_n], #1\n" + "ld1 {v4.4s}, [%[weights_data]], #16\n" + "ld1 {v5.4s}, [%[input_r0]], #16\n" + "ld1 {v6.4s}, [%[input_r1]], #16\n" + "beq 1f\n" + + "0:\n" + "ld1 {v24.4s, v25.4s}, [%[output_r0]]\n" + "ld1 {v26.4s, v27.4s}, [%[output_r1]]\n" + + "ld1 {v9.4s}, [%[weights_data]], #16\n" + "ld1 {v10.4s}, [%[input_r0]], #16\n" + "ld1 {v11.4s}, [%[input_r1]], #16\n" + + "fmla v24.4s, v4.4s, v5.4s\n" + "fmla v26.4s, v4.4s, v6.4s\n" + + "ld1 {v4.4s}, [%[weights_data]], #16\n" + "ld1 {v5.4s}, [%[input_r0]], #16\n" + "ld1 {v6.4s}, [%[input_r1]], #16\n" + + "fmla v25.4s, v9.4s, v10.4s\n" + "fmla v27.4s, v9.4s, v11.4s\n" + + "st1 {v24.4s, v25.4s}, [%[output_r0]], #32\n" + "st1 {v26.4s, v27.4s}, [%[output_r1]], #32\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v24.4s}, [%[output_r0]]\n" + "ld1 {v26.4s}, [%[output_r1]]\n" + "cmp %[oddn], #1\n" + + "fmla v24.4s, v4.4s, v5.4s\n" + "fmla v26.4s, v4.4s, v6.4s\n" + + "st1 {v24.4s}, [%[output_r0]], #16\n" + "st1 {v26.4s}, [%[output_r1]], #16\n" + + "beq 2f\n" + "ld1 {v25.4s}, [%[output_r0]]\n" + "ld1 {v27.4s}, [%[output_r1]]\n" + + "ld1 {v9.4s}, [%[weights_data]], #16\n" + "ld1 {v10.4s}, [%[input_r0]], #16\n" + "ld1 {v11.4s}, [%[input_r1]], #16\n" + + "fmla v25.4s, v9.4s, v10.4s\n" + "fmla v27.4s, v9.4s, v11.4s\n" + + "st1 {v25.4s}, [%[output_r0]], #16\n" + "st1 {v27.4s}, [%[output_r1]], #16\n" + "2:\n" + : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0), + [input_r1] "+r"(input_r1), [output_r0] "+r"(output_r0), + [output_r1] 
"+r"(output_r1), [_n] "+r"(_n) + : [oddn] "r"(oddn) + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + } + if (remain >= 2) + { + asm volatile("ld1 {v24.2s}, [%[output_r0]]\n" + "ld1 {v26.2s}, [%[output_r1]]\n" + "ld1 {v4.2s}, [%[weights_data]], #8\n" + "ld1 {v5.2s}, [%[input_r0]], #8\n" + + "ld1 {v6.2s}, [%[input_r1]], #8\n" + + "fmla v24.2s, v4.2s, v5.2s\n" + "fmla v26.2s, v4.2s, v6.2s\n" + + "st1 {v24.2s}, [%[output_r0]], #8\n" + "st1 {v26.2s}, [%[output_r1]], #8\n" + : [weights_data] "+r"(weights_data), [input_r0] "+r"(input_r0), + [input_r1] "+r"(input_r1), [output_r0] "+r"(output_r0), + [output_r1] "+r"(output_r1) + : + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v24", "v26", "v28", + "v30"); + remain -= 2; + } + + if (remain > 0) + { + *output_r0++ += (*weights_data) * (*input_r0++); + *output_r1++ += (*weights_data++) * (*input_r1++); + } + ow += 2; + } + + for (; ow < outw; ow++) + { + const float *input_data = input_data1 + (ow * stridew - padw) * channels; + + if (padding && ((ow * stridew + kw < padw) || (ow * strideh + kw >= padw + w))) + { + continue; + } + + int nn = channels >> 2; + int remain = channels & 0x03; + + const float *weights_data = kernel_data; + float *output_data = out_mat.data + (oh * outw + ow) * channels; + + if (nn > 0) + { + int _n = (nn + 1) >> 1; + int oddn = nn & 1; + + asm volatile("subs %[_n], %[_n], #1\n" + "ld1 {v4.4s}, [%[weights_data]], #16\n" + "ld1 {v5.4s}, [%[input_data]], #16\n" + "beq 1f\n" + + "0:\n" + "ld1 {v30.4s, v31.4s}, [%[output_data]]\n" + "ld1 {v6.4s}, [%[weights_data]], #16\n" + "ld1 {v7.4s}, [%[input_data]], #16\n" + "fmla v30.4s, v4.4s, v5.4s\n" + + "ld1 {v4.4s}, [%[weights_data]], #16\n" + "ld1 {v5.4s}, [%[input_data]], #16\n" + "fmla v31.4s, v6.4s, v7.4s\n" + + "st1 {v30.4s, v31.4s}, [%[output_data]], #32\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v30.4s}, [%[output_data]]\n" + "cmp %[oddn], #1\n" + "fmla v30.4s, v4.4s, v5.4s\n" + "st1 {v30.4s}, [%[output_data]], #16\n" + "beq 2f\n" + "ld1 {v31.4s}, [%[output_data]]\n" + "ld1 {v6.4s}, [%[weights_data]], #16\n" + "ld1 {v7.4s}, [%[input_data]], #16\n" + "fmla v31.4s, v6.4s, v7.4s\n" + + "st1 {v31.4s}, [%[output_data]], #16\n" + "2:\n" + : [weights_data] "+r"(weights_data), [input_data] "+r"(input_data), + [output_data] "+r"(output_data), [_n] "+r"(_n) + : [oddn] "r"(oddn) + : "cc", "memory", "v4", "v5", "v30", "v31"); + } + if (remain >= 2) + { + asm volatile("ld1 {v30.2s}, [%[output_data]]\n" + "ld1 {v4.2s}, [%[weights_data]], #8\n" + "ld1 {v5.2s}, [%[input_data]], #8\n" + + "fmla v30.2s, v4.2s, v5.2s\n" + + "st1 {v30.2s}, [%[output_data]], #8\n" + : [weights_data] "+r"(weights_data), [input_data] "+r"(input_data), + [output_data] "+r"(output_data) + : + : "cc", "memory", "v4", "v5", "v30"); + remain -= 2; + } + + if (remain > 0) + { + *output_data++ += (*weights_data++) * (*input_data++); + } + } + } + } + } +#else // __aarch64__ + (void)in_mat; + (void)out_mat; + (void)kernel; + (void)in_param; +#endif // __aarch64__ +} + +void srcn_depthwise_conv(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convMat_t &bias, const convParams_t &in_param, int num_threads, + convType_t conv_type) +{ + omp_set_num_threads(num_threads); + + if (conv_type == col_major) + { + depthwise_conv_colmajor(in_mat, out_mat, weights_mat, in_param); + return; + } + + else if (conv_type == row_major) + { + if (in_param.kernel_w == 3 && in_param.kernel_h == 
3 && in_param.dilation_w == 1 && + in_param.dilation_h == 1) + { + if (in_param.stride_w == 1 && in_param.stride_h == 1) + { + if (in_param.padding == 0) + depthwise_conv3x3S1_nopad(in_mat, out_mat, weights_mat, bias); + else + depthwise_conv3x3S1_padding(in_mat, out_mat, weights_mat, bias); + } + else if (in_param.stride_w == 2 && in_param.stride_h == 2) + { + if (in_param.padding == 0) + depthwise_conv3x3S2_nopad(in_mat, out_mat, weights_mat, bias); + else + { + if (in_param.pad_w == 0 && in_param.pad_h == 0) + depthwise_conv3x3S2_padding00(in_mat, out_mat, weights_mat, bias); + else if (in_param.pad_w == 0 && in_param.pad_h == 1) + depthwise_conv3x3S2_padding10(in_mat, out_mat, weights_mat, bias); + else if (in_param.pad_w == 1 && in_param.pad_h == 0) + depthwise_conv3x3S2_padding01(in_mat, out_mat, weights_mat, bias); + else if (in_param.pad_w == 1 && in_param.pad_h == 1) + depthwise_conv3x3S2_padding11(in_mat, out_mat, weights_mat, bias); + } + } + } + } +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/direct_conv_colmajor.cc b/compute/ncnn/src/srcn/direct_conv_colmajor.cc new file mode 100644 index 000000000..300235222 --- /dev/null +++ b/compute/ncnn/src/srcn/direct_conv_colmajor.cc @@ -0,0 +1,5872 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include <stdlib.h> +#include <arm_neon.h> +#include "ncnn/srcn/conv_type.h" + +namespace nnfw +{ +namespace srcn +{ + +#if __aarch64__ +static void direct_conv_l(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img0 = + bottom_blob.data + (m / kernel_w - pad_top) * w * inch + (m % kernel_w - pad_left) * inch; + +#ifdef _OPENMP +#pragma omp parallel for +#endif // _OPENMP + for (int p = 0; p < outh; p++) + { + float *out0 = top_blob.data + p * outw * outch; + + // clear output + if (m == 0) + { + for (int j = 0; j < outw * outch; j++) + { + *(out0 + j) = 0.f; + } + } + + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + const float *img1 = img0 + p * w * inch * _stride; + + int q = 0; + for (; q + 3 < outw; /*q += 4*/) + { + if (padding) + { + if (((q + 3) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 4; + img1 += inch * _stride * 4; + q += 4; + continue; + } + else if ((q + 3) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + int delta = (pad_left - m % kernel_w) / _stride - q; + delta += (pad_left - m % kernel_w) % _stride ? 
1 : 0; + out0 += outch * delta; + img1 += inch * _stride * delta; + q += delta; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *_x2 = img1 + inch * _stride * 2; + const float *_x3 = img1 + inch * _stride * 3; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + register float32x4_t rx2 asm("v16") = vld1q_f32(_x2); + register float32x4_t rx3 asm("v17") = vld1q_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + 
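+ // Label scheme for this 2x-unrolled loop: "0:" is the steady state, + // "1:" retires the final pair of outch-by-4 blocks, "2:" handles one + // leftover block (entered directly when _n == 0, or after "1:" when nn + // is odd), and "3:" is the exit.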
"1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n), [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", 
"v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v30.2s, v8.2s, %[rx2].s[2]\n" + "fmla v31.2s, v8.2s, %[rx3].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + "fmla v30.2s, v9.2s, %[rx2].s[3]\n" + "fmla v31.2s, v9.2s, %[rx3].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15", "v30", + "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x2 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x2 + 3)); + + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x3 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x3 + 3)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + _x2 += 4; + _x3 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, 
v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", 
"memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)); + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + _x2 += 2; + _x3 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_dup_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_dup_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, 
[%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + *outptr2 += (*kernel0) * (*_x2); + *outptr3 += (*kernel0) * (*_x3); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + _x0 += 1; + _x1 += 1; + _x2 += 1; + _x3 += 1; + } + + img1 += inch * 4 * _stride; + out0 += outch * 4; + q += 4; + } + + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + 
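+ // Body below blocks the inner product 4 input channels x 4 output channels + // for the two output positions: rx0/rx1 each hold four input-channel values, + // x0 walks the kernel with a byte stride of outch*4 (one input-channel step + // of the [inch][outch] slice), and v14/v15 accumulate four output channels + // per position.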
{ + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 
{v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "st1 
{v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, 
v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 
{v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + + float *outptr0 = out0; + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla 
v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), 
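/* annotation (added for readability): the "+r" ties here are read-write, so the asm advances kernel0 and outptr0 in place across the outch blocks */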
[outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} + +static void direct_conv_s(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int p = 0; p < outh; p++) + { + const float *img0 = bottom_blob.data + (p * _stride - pad_top) * w * inch; + float *out = top_blob.data + p * outw * outch; + + // clear output + for (int j = 0; j < outw * outch; j++) + { + *(out + j) = 0.f; + } + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + float *out0 = out; + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img1 = img0 + (m / kernel_w) * w * inch + (m % kernel_w - pad_left) * inch; + + int q = 0; + for (; q + 3 < outw; /*q += 4*/) + { + if (padding) + { + if (((q + 3) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 4; + img1 += inch * _stride * 4; + q += 4; + continue; + } + else if ((q + 3) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + int delta = (pad_left - m % kernel_w) / _stride - q; + delta += (pad_left - m % kernel_w) % _stride ? 
1 : 0; + out0 += outch * delta; + img1 += inch * _stride * delta; + q += delta; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *_x2 = img1 + inch * _stride * 2; + const float *_x3 = img1 + inch * _stride * 3; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + register float32x4_t rx2 asm("v16") = vld1q_f32(_x2); + register float32x4_t rx3 asm("v17") = vld1q_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + 
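/* label 1 drains the two-stage software pipeline: the 0b loop alternates v6-v9 and v10-v13 as live/prefetched kernel vectors so the next loads overlap the current fmla+store work; label 2 below runs the final iteration when nn is odd */ +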
"1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v30.4s, v12.4s, %[rx2].s[2]\n" + "fmla v31.4s, v12.4s, %[rx3].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + "fmla v30.4s, v13.4s, %[rx2].s[3]\n" + "fmla v31.4s, v13.4s, %[rx3].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v30.4s, v8.4s, %[rx2].s[2]\n" + "fmla v31.4s, v8.4s, %[rx3].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + "fmla v30.4s, v9.4s, %[rx2].s[3]\n" + "fmla v31.4s, v9.4s, %[rx3].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n), [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", 
"v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v30.2s, v8.2s, %[rx2].s[2]\n" + "fmla v31.2s, v8.2s, %[rx3].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + "fmla v30.2s, v9.2s, %[rx2].s[3]\n" + "fmla v31.2s, v9.2s, %[rx3].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15", "v30", + "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x2 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x2 + 3)); + + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x3 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x3 + 3)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + _x2 += 4; + _x3 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, 
v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v30.4s, v11.4s, %[rx2].s[1]\n" + "fmla v31.4s, v11.4s, %[rx3].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v30.4s, v7.4s, %[rx2].s[1]\n" + "fmla v31.4s, v7.4s, %[rx3].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", 
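/* the "memory" clobber that follows is required: this block loads and stores the accumulators through outptr0-outptr3 behind the compiler's back */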
"memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v30.2s, v7.2s, %[rx2].s[1]\n" + "fmla v31.2s, v7.2s, %[rx3].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + *outptr2 += (*kernel0) * (*_x2) + (*(kernel0 + outch)) * (*(_x2 + 1)); + *outptr3 += (*kernel0) * (*_x3) + (*(kernel0 + outch)) * (*(_x3 + 1)); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + _x2 += 2; + _x3 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + register float32x2_t rx2 asm("v16") = vld1_dup_f32(_x2); + register float32x2_t rx3 asm("v17") = vld1_dup_f32(_x3); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + float *outptr2 = out0 + outch * 2; + float *outptr3 = out0 + outch * 3; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile( + "cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, 
[%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v30.4s, v10.4s, %[rx2].s[0]\n" + "fmla v31.4s, v10.4s, %[rx3].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + "ld1 {v30.4s}, [%[outptr2]]\n" + "ld1 {v31.4s}, [%[outptr3]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v30.4s, v6.4s, %[rx2].s[0]\n" + "fmla v31.4s, v6.4s, %[rx3].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "st1 {v30.4s}, [%[outptr2]], #16\n" + "st1 {v31.4s}, [%[outptr3]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), + [_n] "+r"(_n), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn), [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15", "v30", "v31"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + "ld1 {v30.2s}, [%[outptr2]]\n" + "ld1 {v31.2s}, [%[outptr3]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v30.2s, v6.2s, %[rx2].s[0]\n" + "fmla v31.2s, v6.2s, %[rx3].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + "st1 {v30.2s}, [%[outptr2]], #8\n" + "st1 {v31.2s}, [%[outptr3]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3) + : [rx0] "w"(rx0), [rx1] "w"(rx1), + + [rx2] "w"(rx2), [rx3] "w"(rx3) + : "cc", "memory", "x0", "v6", "v14", "v15", "v30", "v31"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + *outptr2 += (*kernel0) * (*_x2); + *outptr3 += (*kernel0) * (*_x3); + + kernel0++; + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + + _x0 += 1; + _x1 += 1; + _x2 += 1; + _x3 += 1; + } + + img1 += inch * 4 * _stride; + out0 += outch * 4; + q += 4; + } + + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + 
{ + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("v5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v15.4s, v12.4s, %[rx1].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + "fmla v15.4s, v13.4s, %[rx1].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 
{v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v15.4s, v8.4s, %[rx1].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + "fmla v15.4s, v9.4s, %[rx1].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v15.2s, v8.2s, %[rx1].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + "fmla v15.2s, v9.2s, %[rx1].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "st1 
{v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v15.4s, v11.4s, %[rx1].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v15.4s, v7.4s, %[rx1].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v15.2s, v7.2s, %[rx1].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v7", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("v5") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, 
v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v15.4s, v10.4s, %[rx1].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + "ld1 {v15.4s}, [%[outptr1]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v15.4s, v6.4s, %[rx1].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "st1 {v15.4s}, [%[outptr1]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14", "v15"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + "ld1 {v15.2s}, [%[outptr1]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v15.2s, v6.2s, %[rx1].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + "st1 {v15.2s}, [%[outptr1]], #8\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) + : "cc", "memory", "x0", "v6", "v14", "v15"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("v4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 
{v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v13.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + "fmla v14.4s, v12.4s, %[rx0].s[2]\n" + "fmla v14.4s, v13.4s, %[rx0].s[3]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + "fmla v14.4s, v8.4s, %[rx0].s[2]\n" + "fmla v14.4s, v9.4s, %[rx0].s[3]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v8.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v9.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + "fmla v14.2s, v8.2s, %[rx0].s[2]\n" + "fmla v14.2s, v9.2s, %[rx0].s[3]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v8", "v9", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_f32(_x0); + + float *outptr0 = out0; + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla 
v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + "fmla v14.4s, v11.4s, %[rx0].s[1]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + "fmla v14.4s, v7.4s, %[rx0].s[1]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v7", "v10", "v11", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + "add x0, x0, %[stride]\n" + "ld1 {v7.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + "fmla v14.2s, v7.2s, %[rx0].s[1]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v7", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("v4") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + "beq 1f\n" + + "0:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v10.4s}, [x0]\n" + + "fmla v14.4s, v10.4s, %[rx0].s[0]\n" + + "cmp %[oddn], #1\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + + "bne 3f\n" + + "2:\n" + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "ld1 {v6.4s}, [x0]\n" + + "ld1 {v14.4s}, [%[outptr0]]\n" + + "fmla v14.4s, v6.4s, %[rx0].s[0]\n" + + "st1 {v14.4s}, [%[outptr0]], #16\n" + "3:\n" + : [kernel0] "+r"(kernel0), 
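/* single-column tail: same pipelined schedule as the blocks above, but only one accumulator (v14) is live per group of four output channels */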
[outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) + : "cc", "memory", "x0", "v6", "v10", "v14"); + } + + if (remain >= 2) + { + asm volatile("ld1 {v14.2s}, [%[outptr0]]\n" + + "mov x0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "ld1 {v6.2s}, [x0]\n" + + "fmla v14.2s, v6.2s, %[rx0].s[0]\n" + + "st1 {v14.2s}, [%[outptr0]], #8\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) + : "cc", "memory", "x0", "v6", "v14"); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} + +#else // __aarch64__ +static void direct_conv_l(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img0 = + bottom_blob.data + (m / kernel_w - pad_top) * w * inch + (m % kernel_w - pad_left) * inch; + +#ifdef _OPENMP +#pragma omp parallel for +#endif // _OPENMP + for (int p = 0; p < outh; p++) + { + float *out0 = top_blob.data + p * outw * outch; + // clear output. + if (m == 0) + { + for (int j = 0; j < outw * outch; j++) + { + *(out0 + j) = 0.f; + } + } + + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + const float *img1 = img0 + p * w * inch * _stride; + + int q = 0; + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + w) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("q5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + 
"vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, 
%[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d30, d12, %e[rx1][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d30, d14, %e[rx1][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d30, d16, %f[rx1][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + "vmla.f32 d30, d18, %f[rx1][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "cmp 
%[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + "vmla.f32 d30, d14, %P[rx1][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + 
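+                        // Note on the label scheme used by these blocks: 0 is the main
+                        // loop, unrolled to retire two groups of four output channels
+                        // per pass; 1 drains the group pre-loaded into q6, then issues
+                        // the q10 group; when %[oddn] is set (nn was odd) control falls
+                        // into 2 for one final group, otherwise it branches to 3.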
"vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w) >= pad_left + bottom_blob.w) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, 
[r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add 
%[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], 
#16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14" + +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} + +static void direct_conv_s(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &_kernel, const int _stride, const int padding, + const int pad_top, const int pad_left) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + const int kernel_w = _kernel.w; + const int kernel_h = _kernel.h; + +#ifdef _OPENMP +#pragma omp parallel for +#endif // _OPENMP + for (int p = 0; p < outh; p++) + { + const float *img0 = bottom_blob.data + (p * _stride - pad_top) * w * inch; + float *out = top_blob.data + p * outw * outch; + + // clear output. 
+ for (int j = 0; j < outw * outch; j++) + { + *(out + j) = 0.f; + } + + for (int m = 0; m < kernel_w * kernel_h; m++) + { + if (padding) + { + if (((p * _stride + m / kernel_w) < pad_top) || (p * _stride + m / kernel_w >= pad_top + h)) + { + continue; + } + } + + float *out0 = out; + const float *_kernel0 = _kernel.data + m * inch * outch; + const float *img1 = img0 + (m / kernel_w) * w * inch + (m % kernel_w - pad_left) * inch; + + int q = 0; + for (; q + 1 < outw; /*q += 2*/) + { + if (padding) + { + if (((q + 1) * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + out0 += outch * 2; + img1 += inch * _stride * 2; + q += 2; + continue; + } + else if (q * _stride + m % kernel_w < pad_left) + { + out0 += outch; + img1 += inch * _stride; + q++; + continue; + } + else if ((q + 1) * _stride + m % kernel_w >= pad_left + w) + { + break; + } + } + + const float *_x0 = img1; + const float *_x1 = img1 + inch * _stride; + + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + register float32x4_t rx1 asm("q5") = vld1q_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, 
%f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q15, q10, %e[rx1][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q15, q11, %e[rx1][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q15, q12, %f[rx1][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + "vmla.f32 q15, q13, %f[rx1][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q15, q6, %e[rx1][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q15, q7, %e[rx1][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q15, q8, %f[rx1][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + "vmla.f32 q15, q9, %f[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15"); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d30, d12, %e[rx1][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d30, d14, %e[rx1][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d30, d16, %f[rx1][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + "vmla.f32 d30, d18, %f[rx1][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x1 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x1 + 3)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch * 3; + _x0 += 4; + _x1 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = 
vld1_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + "vmla.f32 q15, q11, %P[rx1][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + "vmla.f32 q15, q7, %P[rx1][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + "vmla.f32 d30, d14, %P[rx1][1]\n" + + "vst1.f32 {d28}, 
[%[outptr0]]!\n" + "vst1.f32 {d30}, [%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [stride] "r"(stride), [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + *outptr1 += (*kernel0) * (*_x1) + (*(kernel0 + outch)) * (*(_x1 + 1)); + + kernel0++; + outptr0++; + outptr1++; + } + + kernel0 += outch; + _x0 += 2; + _x1 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + register float32x2_t rx1 asm("d10") = vld1_dup_f32(_x1); + + float *outptr0 = out0; + float *outptr1 = out0 + outch; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q15, q10, %P[rx1][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + "vld1.f32 {d30-d31}, [%[outptr1]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q15, q6, %P[rx1][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vst1.f32 {d30-d31}, [%[outptr1]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), [_n] "+r"(_n) + : [rx0] "w"(rx0), [rx1] "w"(rx1), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + "vld1.f32 {d30}, [%[outptr1]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d30, d12, %P[rx1][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + "vst1.f32 {d30}, 
[%[outptr1]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1) + : [rx0] "w"(rx0), [rx1] "w"(rx1) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + *outptr1 += (*kernel0) * (*_x1); + + kernel0++; + outptr0++; + outptr1++; + } + + _x0 += 1; + _x1 += 1; + } + + img1 += inch * 2 * _stride; + out0 += outch * 2; + q += 2; + } + + for (; q < outw; q++) + { + if (padding) + { + if ((q * _stride + m % kernel_w < pad_left) || + (q * _stride + m % kernel_w >= pad_left + w)) + { + img1 += inch * _stride; + out0 += outch; + continue; + } + } + + const float *_x0 = img1; + const float *kernel0 = _kernel0; + + int i = 0; + for (; i + 3 < inch; i += 4) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x4_t rx0 asm("q4") = vld1q_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + + "vmla.f32 q14, q10, %e[rx0][0]\n" + "vmla.f32 q14, q11, %e[rx0][1]\n" + "vmla.f32 q14, q12, %f[rx0][0]\n" + "vmla.f32 q14, q13, %f[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add 
r0, r0, %[stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %e[rx0][0]\n" + "vmla.f32 q14, q7, %e[rx0][1]\n" + "vmla.f32 q14, q8, %f[rx0][0]\n" + "vmla.f32 q14, q9, %f[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d16}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d18}, [r0]\n" + + "vmla.f32 d28, d12, %e[rx0][0]\n" + "vmla.f32 d28, d14, %e[rx0][1]\n" + "vmla.f32 d28, d16, %f[rx0][0]\n" + "vmla.f32 d28, d18, %f[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)) + + (*(kernel0 + outch * 2)) * (*(_x0 + 2)) + + (*(kernel0 + outch * 3)) * (*(_x0 + 3)); + + kernel0++; + outptr0++; + } + + kernel0 += outch * 3; + _x0 += 4; + } + + for (; i + 1 < inch; i += 2) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_f32(_x0); + + float *outptr0 = out0; + + int stride = outch << 2; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + "vmla.f32 q14, q11, %P[rx0][1]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, 
[r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + "vmla.f32 q14, q7, %P[rx0][1]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [stride] "r"(stride), [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q10", "q11", "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + "add r0, r0, %[stride]\n" + "vld1.f32 {d14}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + "vmla.f32 d28, d14, %P[rx0][1]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [stride] "r"(stride), [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0) + (*(kernel0 + outch)) * (*(_x0 + 1)); + + kernel0++; + outptr0++; + } + + kernel0 += outch; + _x0 += 2; + } + + for (; i < inch; i++) + { + int nn = outch >> 2; + int remain = outch & 0x03; + + register float32x2_t rx0 asm("d8") = vld1_dup_f32(_x0); + + float *outptr0 = out0; + + if (nn > 0) + { + int _n = nn >> 1; + int oddn = nn & 1; + + asm volatile("cmp %[_n], #0\n" + "beq 2f\n" + "subs %[_n], %[_n], #1\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "beq 1f\n" + + "0:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "subs %[_n], %[_n], #1\n" + "bne 0b\n" + + "1:\n" + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d20-d21}, [r0]\n" + + "vmla.f32 q14, q10, %P[rx0][0]\n" + + "cmp %[oddn], #1\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + + "bne 3f\n" + + "2:\n" + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #16\n" + "vld1.f32 {d12-d13}, [r0]\n" + + "vld1.f32 {d28-d29}, [%[outptr0]]\n" + + "vmla.f32 q14, q6, %P[rx0][0]\n" + + "vst1.f32 {d28-d29}, [%[outptr0]]!\n" + "3:\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0), [_n] "+r"(_n) + : [rx0] "w"(rx0), [oddn] "r"(oddn) +#ifndef _OPENMP + : "cc", "memory", "r0", "q6", "q10", "q14" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + } + + if (remain >= 2) + { + asm volatile("vld1.f32 {d28}, [%[outptr0]]\n" + + "mov r0, %[kernel0]\n" + "add %[kernel0], %[kernel0], #8\n" + "vld1.f32 {d12}, [r0]\n" + + "vmla.f32 d28, d12, %P[rx0][0]\n" + + "vst1.f32 {d28}, [%[outptr0]]!\n" + : [kernel0] "+r"(kernel0), [outptr0] "+r"(outptr0) + : [rx0] "w"(rx0) +#ifndef _OPENMP + : "cc", 
"memory", "r0", "q6", "q14", "q15" +#else // _OPENMP + : "cc", "memory", "r0", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", + "q14", "q15" +#endif // _OPENMP + ); + remain -= 2; + } + + if (remain == 1) + { + *outptr0 += (*kernel0) * (*_x0); + + kernel0++; + outptr0++; + } + + _x0 += 1; + } + + img1 += inch * _stride; + out0 += outch; + } + } + } +} +#endif // __aarch64__ + +void direct_conv_colmajor(const convMat_t &bottom_blob, convMat_t &top_blob, + const convMat_t &kernel, const convParams_t ¶ms, int num_threads) +{ + omp_set_num_threads(num_threads); + + if (bottom_blob.c * top_blob.c < 256 * 256) + { + direct_conv_s(bottom_blob, top_blob, kernel, params.stride_w, params.padding, params.pad_h, + params.pad_w); + return; + } + + direct_conv_l(bottom_blob, top_blob, kernel, params.stride_w, params.padding, params.pad_h, + params.pad_w); +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/direct_conv_colmajor.h b/compute/ncnn/src/srcn/direct_conv_colmajor.h new file mode 100644 index 000000000..5e15192c9 --- /dev/null +++ b/compute/ncnn/src/srcn/direct_conv_colmajor.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__ +#define __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__ + +#include "ncnn/srcn/conv_type.h" + +namespace nnfw +{ +namespace srcn +{ + +void direct_conv_colmajor(const convMat_t &, convMat_t &, const convMat_t &, const convParams_t &, + int); + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_DIRECT_CONV_COLMAJOR_H__ diff --git a/compute/ncnn/src/srcn/sgemm_kernel.cc b/compute/ncnn/src/srcn/sgemm_kernel.cc new file mode 100644 index 000000000..90c3641db --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_kernel.cc @@ -0,0 +1,2508 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <arm_neon.h> + +namespace nnfw +{ +namespace srcn +{ + +#if __aarch64__ +static void sgemm_rowmajor_micro_kernel_8x12(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int oddk = (k & 1); + int nk = ((k + 1) / 2) - 1; + + const int nstride = stride << 2; + + __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov x0, %[res_ptr]\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v17.4s, v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v20.4s, v21.4s, v22.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v23.4s, v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v26.4s, v27.4s, v28.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v29.4s, v30.4s, v31.4s}, [x0]\n" + "cbz %w[nk], 4f\n" + "b 1f\n" + + "0:\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "cbz %w[nk], 4f\n" + + "1:\n" + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v11.4s, v2.4s, v0.s[1]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "fmla v14.4s, v2.4s, v0.s[2]\n" + "fmla v17.4s, v2.4s, v0.s[3]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v12.4s, v3.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[2]\n" + "fmla v18.4s, v3.4s, v0.s[3]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + + "fmla v20.4s, v2.4s, v1.s[0]\n" + "fmla v23.4s, v2.4s, v1.s[1]\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "fmla v26.4s, v2.4s, v1.s[2]\n" + "fmla v29.4s, v2.4s, v1.s[3]\n" + "fmla v21.4s, v3.4s, v1.s[0]\n" + "fmla v24.4s, v3.4s, v1.s[1]\n" + "fmla v27.4s, v3.4s, v1.s[2]\n" + "fmla v30.4s, v3.4s, v1.s[3]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + + "fmla v8.4s, v5.4s, v0.s[0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "fmla v14.4s, v5.4s, v0.s[2]\n" + "fmla v17.4s, v5.4s, v0.s[3]\n" + "fmla v9.4s, v6.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v15.4s, v6.4s, v0.s[2]\n" + "fmla v18.4s, v6.4s, v0.s[3]\n" + "fmla v10.4s, v7.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "fmla v16.4s, v7.4s, v0.s[2]\n" + "fmla v19.4s, v7.4s, v0.s[3]\n" + + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + + "fmla v20.4s, v5.4s, v1.s[0]\n" + "fmla v23.4s, v5.4s, v1.s[1]\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "fmla v26.4s, v5.4s, v1.s[2]\n" + "fmla v29.4s, v5.4s, v1.s[3]\n" + "fmla v21.4s, v6.4s, v1.s[0]\n" + "fmla v24.4s, v6.4s, v1.s[1]\n" + "fmla v27.4s, v6.4s, v1.s[2]\n" + "fmla v30.4s, v6.4s, v1.s[3]\n" + "fmla v22.4s, v7.4s, v1.s[0]\n" + "fmla v25.4s, v7.4s, v1.s[1]\n" + "subs %w[nk], %w[nk], #1\n" + "fmla v28.4s, 
v7.4s, v1.s[2]\n" + "fmla v31.4s, v7.4s, v1.s[3]\n" + "bne 1b\n" + + "4:\n" + "mov x0, %[res_ptr]\n" + "cbnz %[oddk], 2f\n" + + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v11.4s, v2.4s, v0.s[1]\n" + "fmla v12.4s, v3.4s, v0.s[1]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + "fmla v14.4s, v2.4s, v0.s[2]\n" + "fmla v15.4s, v3.4s, v0.s[2]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "fmla v17.4s, v2.4s, v0.s[3]\n" + "fmla v18.4s, v3.4s, v0.s[3]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + + "fmla v20.4s, v2.4s, v1.s[0]\n" + "fmla v21.4s, v3.4s, v1.s[0]\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "fmla v23.4s, v2.4s, v1.s[1]\n" + "fmla v24.4s, v3.4s, v1.s[1]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "fmla v26.4s, v2.4s, v1.s[2]\n" + "fmla v27.4s, v3.4s, v1.s[2]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "fmla v29.4s, v2.4s, v1.s[3]\n" + "fmla v30.4s, v3.4s, v1.s[3]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + + "fmla v8.4s, v5.4s, v0.s[0]\n" + "fmla v9.4s, v6.4s, v0.s[0]\n" + "fmla v10.4s, v7.4s, v0.s[0]\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v12.4s, v6.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v0.s[1]\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v14.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v6.4s, v0.s[2]\n" + "fmla v16.4s, v7.4s, v0.s[2]\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v17.4s, v5.4s, v0.s[3]\n" + "fmla v18.4s, v6.4s, v0.s[3]\n" + "fmla v19.4s, v7.4s, v0.s[3]\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + + "fmla v20.4s, v5.4s, v1.s[0]\n" + "fmla v21.4s, v6.4s, v1.s[0]\n" + "fmla v22.4s, v7.4s, v1.s[0]\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v23.4s, v5.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, v1.s[1]\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v26.4s, v5.4s, v1.s[2]\n" + "fmla v27.4s, v6.4s, v1.s[2]\n" + "fmla v28.4s, v7.4s, v1.s[2]\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v29.4s, v5.4s, v1.s[3]\n" + "fmla v30.4s, v6.4s, v1.s[3]\n" + "fmla v31.4s, v7.4s, v1.s[3]\n" + "b 3f\n" + + "2:\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v11.4s, v2.4s, v0.s[1]\n" + "fmla v12.4s, v3.4s, v0.s[1]\n" + "fmla v13.4s, v4.4s, v0.s[1]\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v14.4s, v2.4s, v0.s[2]\n" + "fmla v15.4s, v3.4s, v0.s[2]\n" + "fmla v16.4s, v4.4s, v0.s[2]\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v17.4s, v2.4s, v0.s[3]\n" + "fmla v18.4s, v3.4s, v0.s[3]\n" + "fmla v19.4s, v4.4s, v0.s[3]\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + + "fmla v20.4s, v2.4s, v1.s[0]\n" + "fmla v21.4s, v3.4s, v1.s[0]\n" + "fmla v22.4s, v4.4s, v1.s[0]\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v23.4s, v2.4s, v1.s[1]\n" + "fmla v24.4s, v3.4s, v1.s[1]\n" + "fmla v25.4s, v4.4s, v1.s[1]\n" + "st1 {v23.4s, 
v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v26.4s, v2.4s, v1.s[2]\n" + "fmla v27.4s, v3.4s, v1.s[2]\n" + "fmla v28.4s, v4.4s, v1.s[2]\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v29.4s, v2.4s, v1.s[3]\n" + "fmla v30.4s, v3.4s, v1.s[3]\n" + "fmla v31.4s, v4.4s, v1.s[3]\n" + + "3:\n" + "st1 {v29.4s, v30.4s, v31.4s}, [x0]\n" + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk) + : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +static void sgemm_rowmajor_micro_kernel_12x8(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int oddk = (k & 1); + int nk = ((k + 1) / 2) - 1; + + const int nstride = stride << 2; + + __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v4.4s, v5.4s}, [%[rhs_ptr]], #32\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov x0, %[res_ptr]\n" + "ld1 {v8.4s, v9.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v10.4s, v11.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v12.4s, v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v14.4s, v15.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v16.4s, v17.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v20.4s, v21.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v22.4s, v23.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v26.4s, v27.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v28.4s, v29.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v30.4s, v31.4s}, [x0]\n" + "cbz %w[nk], 4f\n" + "b 1f\n" + + "0:\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "cbz %w[nk], 4f\n" + + "1:\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v0.s[3]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v15.4s, v5.4s, v0.s[3]\n" + + "fmla v16.4s, v4.4s, v1.s[0]\n" + "fmla v18.4s, v4.4s, v1.s[1]\n" + "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" + "fmla v20.4s, v4.4s, v1.s[2]\n" + "fmla v22.4s, v4.4s, v1.s[3]\n" + "fmla v17.4s, v5.4s, v1.s[0]\n" + "fmla v19.4s, v5.4s, v1.s[1]\n" + "fmla v21.4s, v5.4s, v1.s[2]\n" + "fmla v23.4s, v5.4s, v1.s[3]\n" + + "ld1 {v6.4s, v7.4s}, [%[rhs_ptr]], #32\n" + + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v26.4s, v4.4s, v2.s[1]\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "fmla v28.4s, v4.4s, v2.s[2]\n" + "fmla v30.4s, v4.4s, v2.s[3]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "fmla v27.4s, v5.4s, v2.s[1]\n" + "fmla v29.4s, v5.4s, v2.s[2]\n" + "fmla v31.4s, v5.4s, v2.s[3]\n" + + "fmla v8.4s, v6.4s, v0.s[0]\n" + 
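+                     // Second half of the two-way unrolled k loop: v6/v7 carry the
+                     // next eight RHS values while v0..v2 are refilled with twelve
+                     // LHS values; v8..v31 form the full 12x8 accumulator tile
+                     // (24 vector registers, 96 floats).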
"fmla v10.4s, v6.4s, v0.s[1]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "fmla v12.4s, v6.4s, v0.s[2]\n" + "fmla v14.4s, v6.4s, v0.s[3]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "fmla v13.4s, v7.4s, v0.s[2]\n" + "fmla v15.4s, v7.4s, v0.s[3]\n" + + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" + "fmla v20.4s, v6.4s, v1.s[2]\n" + "fmla v22.4s, v6.4s, v1.s[3]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v21.4s, v7.4s, v1.s[2]\n" + "fmla v23.4s, v7.4s, v1.s[3]\n" + + "ld1 {v4.4s, v5.4s}, [%[rhs_ptr]], #32\n" + + "fmla v24.4s, v6.4s, v2.s[0]\n" + "fmla v26.4s, v6.4s, v2.s[1]\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "fmla v28.4s, v6.4s, v2.s[2]\n" + "fmla v30.4s, v6.4s, v2.s[3]\n" + "fmla v25.4s, v7.4s, v2.s[0]\n" + "fmla v27.4s, v7.4s, v2.s[1]\n" + "subs %w[nk], %w[nk], #1\n" + "fmla v29.4s, v7.4s, v2.s[2]\n" + "fmla v31.4s, v7.4s, v2.s[3]\n" + "bne 1b\n" + + "4:\n" + "mov x0, %[res_ptr]\n" + "cbnz %[oddk], 2f\n" + + "fmla v8.4s, v4.4s, v0.s[0]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "fmla v14.4s, v4.4s, v0.s[3]\n" + "fmla v15.4s, v5.4s, v0.s[3]\n" + + "fmla v16.4s, v4.4s, v1.s[0]\n" + "fmla v17.4s, v5.4s, v1.s[0]\n" + "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" + "fmla v18.4s, v4.4s, v1.s[1]\n" + "fmla v19.4s, v5.4s, v1.s[1]\n" + "fmla v20.4s, v4.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v1.s[2]\n" + "fmla v22.4s, v4.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v1.s[3]\n" + + "ld1 {v6.4s, v7.4s}, [%[rhs_ptr]], #32\n" + + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "fmla v26.4s, v4.4s, v2.s[1]\n" + "fmla v27.4s, v5.4s, v2.s[1]\n" + "fmla v28.4s, v4.4s, v2.s[2]\n" + "fmla v29.4s, v5.4s, v2.s[2]\n" + "fmla v30.4s, v4.4s, v2.s[3]\n" + "fmla v31.4s, v5.4s, v2.s[3]\n" + + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v9.4s, v7.4s, v0.s[0]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "st1 {v8.4s, v9.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v10.4s, v6.4s, v0.s[1]\n" + "fmla v11.4s, v7.4s, v0.s[1]\n" + "st1 {v10.4s, v11.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v12.4s, v6.4s, v0.s[2]\n" + "fmla v13.4s, v7.4s, v0.s[2]\n" + "st1 {v12.4s, v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v14.4s, v6.4s, v0.s[3]\n" + "fmla v15.4s, v7.4s, v0.s[3]\n" + "st1 {v14.4s, v15.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + + "fmla v16.4s, v6.4s, v1.s[0]\n" + "fmla v17.4s, v7.4s, v1.s[0]\n" + "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" + "st1 {v16.4s, v17.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "st1 {v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v20.4s, v6.4s, v1.s[2]\n" + "fmla v21.4s, v7.4s, v1.s[2]\n" + "st1 {v20.4s, v21.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v22.4s, v6.4s, v1.s[3]\n" + "fmla v23.4s, v7.4s, v1.s[3]\n" + "st1 {v22.4s, v23.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + + "fmla v24.4s, v6.4s, v2.s[0]\n" + "fmla v25.4s, v7.4s, v2.s[0]\n" + "st1 {v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v26.4s, v6.4s, v2.s[1]\n" + "fmla v27.4s, v7.4s, v2.s[1]\n" + "st1 {v26.4s, v27.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v28.4s, v6.4s, v2.s[2]\n" + "fmla v29.4s, v7.4s, v2.s[2]\n" + "st1 {v28.4s, v29.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v30.4s, v6.4s, 
v2.s[3]\n" + "fmla v31.4s, v7.4s, v2.s[3]\n" + "b 3f\n" + + "2:\n" + "fmla v8.4s, v4.4s, v0.s[0]\n" + "fmla v9.4s, v5.4s, v0.s[0]\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "st1 {v8.4s, v9.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v10.4s, v4.4s, v0.s[1]\n" + "fmla v11.4s, v5.4s, v0.s[1]\n" + "st1 {v10.4s, v11.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v12.4s, v4.4s, v0.s[2]\n" + "fmla v13.4s, v5.4s, v0.s[2]\n" + "st1 {v12.4s, v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v14.4s, v4.4s, v0.s[3]\n" + "fmla v15.4s, v5.4s, v0.s[3]\n" + "st1 {v14.4s, v15.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + + "fmla v16.4s, v4.4s, v1.s[0]\n" + "fmla v17.4s, v5.4s, v1.s[0]\n" + "ld1 {v2.4s}, [%[lhs_ptr]], #16\n" + "st1 {v16.4s, v17.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v18.4s, v4.4s, v1.s[1]\n" + "fmla v19.4s, v5.4s, v1.s[1]\n" + "st1 {v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v20.4s, v4.4s, v1.s[2]\n" + "fmla v21.4s, v5.4s, v1.s[2]\n" + "st1 {v20.4s, v21.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v22.4s, v4.4s, v1.s[3]\n" + "fmla v23.4s, v5.4s, v1.s[3]\n" + "st1 {v22.4s, v23.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + + "fmla v24.4s, v4.4s, v2.s[0]\n" + "fmla v25.4s, v5.4s, v2.s[0]\n" + "st1 {v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v26.4s, v4.4s, v2.s[1]\n" + "fmla v27.4s, v5.4s, v2.s[1]\n" + "st1 {v26.4s, v27.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v28.4s, v4.4s, v2.s[2]\n" + "fmla v29.4s, v5.4s, v2.s[2]\n" + "st1 {v28.4s, v29.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v30.4s, v4.4s, v2.s[3]\n" + "fmla v31.4s, v5.4s, v2.s[3]\n" + + "3:\n" + "st1 {v30.4s, v31.4s}, [x0]\n" + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk) + : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride) + : "x0", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +#ifdef BATCH_DILATION_FIX +static void sgemm_rowmajor_micro_kernel_4x24(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int oddk = (k & 1); + int nk = ((k + 1) / 2) - 1; + + const int nstride = stride << 2; + + __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov x0, %[res_ptr]\n" + "mov x1, x0\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n" + "ld1 {v17.4s, v18.4s, v19.4s}, [x1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "ld1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n" + "ld1 {v23.4s, v24.4s, v25.4s}, [x1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "ld1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n" + "ld1 {v29.4s, v30.4s, v31.4s}, [x1]\n" + "cbz %w[nk], 4f\n" + "b 1f\n" + + "0:\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + 
"movi v31.4s, #0x0\n" + "cbz %w[nk], 4f\n" + + "1:\n" + "mov x0, v0.d[0]\n" + "cmp x0, #0\n" + "bne 5f\n" + "mov x0, v0.d[1]\n" + "cmp x0, #0\n" + "bne 5f\n" + "add %[rhs_ptr], %[rhs_ptr], #96\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "b 6f\n" + "5:\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v14.4s, v2.4s, v0.s[1]\n" + "fmla v20.4s, v2.4s, v0.s[2]\n" + "fmla v26.4s, v2.4s, v0.s[3]\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v15.4s, v3.4s, v0.s[1]\n" + "fmla v21.4s, v3.4s, v0.s[2]\n" + "fmla v27.4s, v3.4s, v0.s[3]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v16.4s, v4.4s, v0.s[1]\n" + "fmla v22.4s, v4.4s, v0.s[2]\n" + "fmla v28.4s, v4.4s, v0.s[3]\n" + + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + + "fmla v11.4s, v5.4s, v0.s[0]\n" + "fmla v17.4s, v5.4s, v0.s[1]\n" + "fmla v23.4s, v5.4s, v0.s[2]\n" + "fmla v29.4s, v5.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[0]\n" + "fmla v18.4s, v6.4s, v0.s[1]\n" + "fmla v24.4s, v6.4s, v0.s[2]\n" + "fmla v30.4s, v6.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[0]\n" + "fmla v19.4s, v7.4s, v0.s[1]\n" + "fmla v25.4s, v7.4s, v0.s[2]\n" + "fmla v31.4s, v7.4s, v0.s[3]\n" + + "6:\n" + "mov x0, v1.d[0]\n" + "cmp x0, #0\n" + "bne 7f\n" + "mov x0, v1.d[1]\n" + "cmp x0, #0\n" + "bne 7f\n" + "add %[rhs_ptr], %[rhs_ptr], #96\n" + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "b 8f\n" + "7:\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + "fmla v8.4s, v2.4s, v1.s[0]\n" + "fmla v14.4s, v2.4s, v1.s[1]\n" + "fmla v20.4s, v2.4s, v1.s[2]\n" + "fmla v26.4s, v2.4s, v1.s[3]\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + "fmla v9.4s, v3.4s, v1.s[0]\n" + "fmla v15.4s, v3.4s, v1.s[1]\n" + "fmla v21.4s, v3.4s, v1.s[2]\n" + "fmla v27.4s, v3.4s, v1.s[3]\n" + "fmla v10.4s, v4.4s, v1.s[0]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v1.s[2]\n" + "fmla v28.4s, v4.4s, v1.s[3]\n" + + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + + "fmla v11.4s, v5.4s, v1.s[0]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v1.s[2]\n" + "fmla v29.4s, v5.4s, v1.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v24.4s, v6.4s, v1.s[2]\n" + "fmla v30.4s, v6.4s, v1.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "fmla v25.4s, v7.4s, v1.s[2]\n" + "fmla v31.4s, v7.4s, v1.s[3]\n" + + "8:\n" + "subs %w[nk], %w[nk], #1\n" + "bne 1b\n" + + "4:\n" + "mov x0, %[res_ptr]\n" + "cbnz %[oddk], 2f\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v14.4s, v2.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[1]\n" + "fmla v20.4s, v2.4s, v0.s[2]\n" + "fmla v21.4s, v3.4s, v0.s[2]\n" + "fmla v22.4s, v4.4s, v0.s[2]\n" + "fmla v26.4s, v2.4s, v0.s[3]\n" + "fmla v27.4s, v3.4s, v0.s[3]\n" + "fmla v28.4s, v4.4s, v0.s[3]\n" + + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + + "fmla v11.4s, v5.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v5.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v0.s[1]\n" + "fmla v23.4s, v5.4s, v0.s[2]\n" + "fmla v24.4s, v6.4s, v0.s[2]\n" + "fmla v25.4s, v7.4s, v0.s[2]\n" + "fmla v29.4s, v5.4s, v0.s[3]\n" + "fmla v30.4s, v6.4s, v0.s[3]\n" + "fmla v31.4s, v7.4s, v0.s[3]\n" + + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + 
+ "fmla v8.4s, v2.4s, v1.s[0]\n" + "fmla v9.4s, v3.4s, v1.s[0]\n" + "fmla v10.4s, v4.4s, v1.s[0]\n" + "mov x1, x0\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n" + "fmla v11.4s, v5.4s, v1.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n" + "fmla v14.4s, v2.4s, v1.s[1]\n" + "fmla v15.4s, v3.4s, v1.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n" + "fmla v20.4s, v2.4s, v1.s[2]\n" + "fmla v21.4s, v3.4s, v1.s[2]\n" + "fmla v22.4s, v4.4s, v1.s[2]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n" + "fmla v23.4s, v5.4s, v1.s[2]\n" + "fmla v24.4s, v6.4s, v1.s[2]\n" + "fmla v25.4s, v7.4s, v1.s[2]\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n" + "fmla v26.4s, v2.4s, v1.s[3]\n" + "fmla v27.4s, v3.4s, v1.s[3]\n" + "fmla v28.4s, v4.4s, v1.s[3]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n" + "fmla v29.4s, v5.4s, v1.s[3]\n" + "fmla v30.4s, v6.4s, v1.s[3]\n" + "fmla v31.4s, v7.4s, v1.s[3]\n" + "b 3f\n" + + "2:\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "mov x1, x0\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n" + "fmla v11.4s, v5.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v0.s[0]\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n" + "fmla v14.4s, v2.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n" + "fmla v17.4s, v5.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v0.s[1]\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n" + "fmla v20.4s, v2.4s, v0.s[2]\n" + "fmla v21.4s, v3.4s, v0.s[2]\n" + "fmla v22.4s, v4.4s, v0.s[2]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n" + "fmla v23.4s, v5.4s, v0.s[2]\n" + "fmla v24.4s, v6.4s, v0.s[2]\n" + "fmla v25.4s, v7.4s, v0.s[2]\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n" + "fmla v26.4s, v2.4s, v0.s[3]\n" + "fmla v27.4s, v3.4s, v0.s[3]\n" + "fmla v28.4s, v4.4s, v0.s[3]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n" + "fmla v29.4s, v5.4s, v0.s[3]\n" + "fmla v30.4s, v6.4s, v0.s[3]\n" + "fmla v31.4s, v7.4s, v0.s[3]\n" + "3:\n" + "st1 {v29.4s, v30.4s, v31.4s}, [x1]\n" + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk) + : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride) + : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} +#else // BATCH_DILATION_FIX +static void sgemm_rowmajor_micro_kernel_4x24(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int oddk = (k & 1); + int nk = ((k + 1) / 2) - 1; + + const int nstride = stride << 2; + + __asm __volatile("ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v2.4s}, [%[rhs_ptr]], #16\n" + "ld1 {v3.4s}, [%[rhs_ptr]], #16\n" + "ld1 {v4.4s}, [%[rhs_ptr]], 
#16\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov x0, %[res_ptr]\n" + "mov x1, x0\n" + "ld1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n" + "ld1 {v11.4s, v12.4s, v13.4s}, [x1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "ld1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n" + "ld1 {v17.4s, v18.4s, v19.4s}, [x1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "ld1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n" + "ld1 {v23.4s, v24.4s, v25.4s}, [x1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "ld1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n" + "ld1 {v29.4s, v30.4s, v31.4s}, [x1]\n" + "cbz %w[nk], 4f\n" + "b 1f\n" + + "0:\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "cbz %w[nk], 4f\n" + + "1:\n" + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v14.4s, v2.4s, v0.s[1]\n" + "fmla v20.4s, v2.4s, v0.s[2]\n" + "fmla v26.4s, v2.4s, v0.s[3]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v15.4s, v3.4s, v0.s[1]\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + "fmla v21.4s, v3.4s, v0.s[2]\n" + "fmla v27.4s, v3.4s, v0.s[3]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v16.4s, v4.4s, v0.s[1]\n" + "fmla v22.4s, v4.4s, v0.s[2]\n" + "fmla v28.4s, v4.4s, v0.s[3]\n" + + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + + "fmla v11.4s, v5.4s, v0.s[0]\n" + "fmla v17.4s, v5.4s, v0.s[1]\n" + "fmla v23.4s, v5.4s, v0.s[2]\n" + "fmla v29.4s, v5.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v0.s[0]\n" + "fmla v18.4s, v6.4s, v0.s[1]\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + "fmla v24.4s, v6.4s, v0.s[2]\n" + "fmla v30.4s, v6.4s, v0.s[3]\n" + "fmla v13.4s, v7.4s, v0.s[0]\n" + "fmla v19.4s, v7.4s, v0.s[1]\n" + "fmla v25.4s, v7.4s, v0.s[2]\n" + "fmla v31.4s, v7.4s, v0.s[3]\n" + + "fmla v8.4s, v2.4s, v1.s[0]\n" + "fmla v14.4s, v2.4s, v1.s[1]\n" + "fmla v20.4s, v2.4s, v1.s[2]\n" + "fmla v26.4s, v2.4s, v1.s[3]\n" + "fmla v9.4s, v3.4s, v1.s[0]\n" + "fmla v15.4s, v3.4s, v1.s[1]\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + "fmla v21.4s, v3.4s, v1.s[2]\n" + "fmla v27.4s, v3.4s, v1.s[3]\n" + "fmla v10.4s, v4.4s, v1.s[0]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "fmla v22.4s, v4.4s, v1.s[2]\n" + "fmla v28.4s, v4.4s, v1.s[3]\n" + + "ld1 {v0.4s}, [%[lhs_ptr]], #16\n" + + "fmla v11.4s, v5.4s, v1.s[0]\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v23.4s, v5.4s, v1.s[2]\n" + "fmla v29.4s, v5.4s, v1.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + "fmla v24.4s, v6.4s, v1.s[2]\n" + "fmla v30.4s, v6.4s, v1.s[3]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "subs %w[nk], %w[nk], #1\n" + "fmla v25.4s, v7.4s, v1.s[2]\n" + "fmla v31.4s, v7.4s, v1.s[3]\n" + "bne 1b\n" + + "4:\n" + "mov x0, %[res_ptr]\n" + "cbnz %[oddk], 2f\n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "fmla v14.4s, v2.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[1]\n" + "fmla v20.4s, v2.4s, 
v0.s[2]\n" + "fmla v21.4s, v3.4s, v0.s[2]\n" + "fmla v22.4s, v4.4s, v0.s[2]\n" + "fmla v26.4s, v2.4s, v0.s[3]\n" + "fmla v27.4s, v3.4s, v0.s[3]\n" + "fmla v28.4s, v4.4s, v0.s[3]\n" + + "ld1 {v2.4s, v3.4s, v4.4s}, [%[rhs_ptr]], #48\n" + + "fmla v11.4s, v5.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v0.s[0]\n" + "fmla v17.4s, v5.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v0.s[1]\n" + "fmla v23.4s, v5.4s, v0.s[2]\n" + "fmla v24.4s, v6.4s, v0.s[2]\n" + "fmla v25.4s, v7.4s, v0.s[2]\n" + "fmla v29.4s, v5.4s, v0.s[3]\n" + "fmla v30.4s, v6.4s, v0.s[3]\n" + "fmla v31.4s, v7.4s, v0.s[3]\n" + + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + + "fmla v8.4s, v2.4s, v1.s[0]\n" + "fmla v9.4s, v3.4s, v1.s[0]\n" + "fmla v10.4s, v4.4s, v1.s[0]\n" + "mov x1, x0\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n" + "fmla v11.4s, v5.4s, v1.s[0]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[0]\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n" + "fmla v14.4s, v2.4s, v1.s[1]\n" + "fmla v15.4s, v3.4s, v1.s[1]\n" + "fmla v16.4s, v4.4s, v1.s[1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n" + "fmla v17.4s, v5.4s, v1.s[1]\n" + "fmla v18.4s, v6.4s, v1.s[1]\n" + "fmla v19.4s, v7.4s, v1.s[1]\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n" + "fmla v20.4s, v2.4s, v1.s[2]\n" + "fmla v21.4s, v3.4s, v1.s[2]\n" + "fmla v22.4s, v4.4s, v1.s[2]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n" + "fmla v23.4s, v5.4s, v1.s[2]\n" + "fmla v24.4s, v6.4s, v1.s[2]\n" + "fmla v25.4s, v7.4s, v1.s[2]\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n" + "fmla v26.4s, v2.4s, v1.s[3]\n" + "fmla v27.4s, v3.4s, v1.s[3]\n" + "fmla v28.4s, v4.4s, v1.s[3]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n" + "fmla v29.4s, v5.4s, v1.s[3]\n" + "fmla v30.4s, v6.4s, v1.s[3]\n" + "fmla v31.4s, v7.4s, v1.s[3]\n" + "b 3f\n" + + "2:\n" + "ld1 {v5.4s, v6.4s, v7.4s}, [%[rhs_ptr]], #48\n" + + "fmla v8.4s, v2.4s, v0.s[0]\n" + "fmla v9.4s, v3.4s, v0.s[0]\n" + "fmla v10.4s, v4.4s, v0.s[0]\n" + "mov x1, x0\n" + "st1 {v8.4s, v9.4s, v10.4s}, [x1], #48\n" + "fmla v11.4s, v5.4s, v0.s[0]\n" + "fmla v12.4s, v6.4s, v0.s[0]\n" + "fmla v13.4s, v7.4s, v0.s[0]\n" + "st1 {v11.4s, v12.4s, v13.4s}, [x1]\n" + "fmla v14.4s, v2.4s, v0.s[1]\n" + "fmla v15.4s, v3.4s, v0.s[1]\n" + "fmla v16.4s, v4.4s, v0.s[1]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v14.4s, v15.4s, v16.4s}, [x1], #48\n" + "fmla v17.4s, v5.4s, v0.s[1]\n" + "fmla v18.4s, v6.4s, v0.s[1]\n" + "fmla v19.4s, v7.4s, v0.s[1]\n" + "st1 {v17.4s, v18.4s, v19.4s}, [x1]\n" + "fmla v20.4s, v2.4s, v0.s[2]\n" + "fmla v21.4s, v3.4s, v0.s[2]\n" + "fmla v22.4s, v4.4s, v0.s[2]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v20.4s, v21.4s, v22.4s}, [x1], #48\n" + "fmla v23.4s, v5.4s, v0.s[2]\n" + "fmla v24.4s, v6.4s, v0.s[2]\n" + "fmla v25.4s, v7.4s, v0.s[2]\n" + "st1 {v23.4s, v24.4s, v25.4s}, [x1]\n" + "fmla v26.4s, v2.4s, v0.s[3]\n" + "fmla v27.4s, v3.4s, v0.s[3]\n" + "fmla v28.4s, v4.4s, v0.s[3]\n" + "add x0, x0, %[nstride]\n" + "mov x1, x0\n" + "st1 {v26.4s, v27.4s, v28.4s}, [x1], #48\n" + "fmla v29.4s, v5.4s, v0.s[3]\n" + "fmla v30.4s, v6.4s, v0.s[3]\n" + "fmla v31.4s, v7.4s, v0.s[3]\n" + "3:\n" + "st1 {v29.4s, v30.4s, v31.4s}, [x1]\n" + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk) + : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride) + : "x0", "x1", "v0", "v1", 
"v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} +#endif // BATCH_DILATION_FIX + +static void sgemm_rowmajor_micro_kernel_24x4(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int oddk = (k & 1); + int nk = ((k + 1) / 2) - 1; + + const int nstride = stride << 2; + + __asm __volatile("ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n" + "ld1 {v6.4s}, [%[rhs_ptr]], #16\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov x0, %[res_ptr]\n" + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v9.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v14.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v15.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v16.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v17.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v18.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v20.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v21.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v22.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v23.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v24.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v26.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v27.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v28.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v29.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v30.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "ld1 {v31.4s}, [x0]\n" + "cbz %w[nk], 4f\n" + "b 1f\n" + + "0:\n" + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + "cbz %w[nk], 4f\n" + + "1:\n" + "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v9.4s, v6.4s, v0.s[1]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v11.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v13.4s, v6.4s, v1.s[1]\n" + "ld1 {v7.4s}, [%[rhs_ptr]], #16\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v15.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v17.4s, v6.4s, v2.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v19.4s, v6.4s, v2.s[3]\n" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v21.4s, v6.4s, v3.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v23.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "fmla v25.4s, v6.4s, v4.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v27.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "fmla v29.4s, v6.4s, v5.s[1]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "fmla v31.4s, v6.4s, v5.s[3]\n" + + "ld1 {v3.4s, v4.4s, 
v5.4s}, [%[lhs_ptr]], #48\n" + "fmla v8.4s, v7.4s, v0.s[0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "fmla v10.4s, v7.4s, v0.s[2]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "fmla v12.4s, v7.4s, v1.s[0]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "ld1 {v6.4s}, [%[rhs_ptr]], #16\n" + "fmla v14.4s, v7.4s, v1.s[2]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "fmla v16.4s, v7.4s, v2.s[0]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "fmla v18.4s, v7.4s, v2.s[2]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n" + "fmla v20.4s, v7.4s, v3.s[0]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "fmla v22.4s, v7.4s, v3.s[2]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "fmla v24.4s, v7.4s, v4.s[0]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "fmla v26.4s, v7.4s, v4.s[2]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "fmla v28.4s, v7.4s, v5.s[0]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "subs %w[nk], %w[nk], #1\n" + "fmla v30.4s, v7.4s, v5.s[2]\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "bne 1b\n" + + "4:\n" + "mov x0, %[res_ptr]\n" + "cbnz %[oddk], 2f\n" + + "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v9.4s, v6.4s, v0.s[1]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "fmla v11.4s, v6.4s, v0.s[3]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "fmla v13.4s, v6.4s, v1.s[1]\n" + "ld1 {v7.4s}, [%[rhs_ptr]], #16\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "fmla v15.4s, v6.4s, v1.s[3]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "fmla v17.4s, v6.4s, v2.s[1]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "fmla v19.4s, v6.4s, v2.s[3]\n" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[lhs_ptr]], #48\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "fmla v21.4s, v6.4s, v3.s[1]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "fmla v23.4s, v6.4s, v3.s[3]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "fmla v25.4s, v6.4s, v4.s[1]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "fmla v27.4s, v6.4s, v4.s[3]\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "fmla v29.4s, v6.4s, v5.s[1]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "fmla v31.4s, v6.4s, v5.s[3]\n" + + "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n" + "fmla v8.4s, v7.4s, v0.s[0]\n" + "fmla v9.4s, v7.4s, v0.s[1]\n" + "st1 {v8.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v10.4s, v7.4s, v0.s[2]\n" + "st1 {v9.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v11.4s, v7.4s, v0.s[3]\n" + "st1 {v10.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v12.4s, v7.4s, v1.s[0]\n" + "st1 {v11.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v13.4s, v7.4s, v1.s[1]\n" + "st1 {v12.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v14.4s, v7.4s, v1.s[2]\n" + "st1 {v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v15.4s, v7.4s, v1.s[3]\n" + "st1 {v14.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v16.4s, v7.4s, v2.s[0]\n" + "st1 {v15.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v17.4s, v7.4s, v2.s[1]\n" + "st1 {v16.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v18.4s, v7.4s, v2.s[2]\n" + "st1 {v17.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v19.4s, v7.4s, v2.s[3]\n" + "st1 {v18.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v20.4s, v7.4s, v3.s[0]\n" + "st1 {v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v21.4s, v7.4s, v3.s[1]\n" + "st1 {v20.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v22.4s, v7.4s, v3.s[2]\n" + "st1 {v21.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v23.4s, v7.4s, v3.s[3]\n" + "st1 {v22.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v24.4s, v7.4s, v4.s[0]\n" + "st1 {v23.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v25.4s, v7.4s, v4.s[1]\n" + "st1 {v24.4s}, [x0]\n" + "add x0, x0, 
%[nstride]\n" + "fmla v26.4s, v7.4s, v4.s[2]\n" + "st1 {v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v27.4s, v7.4s, v4.s[3]\n" + "st1 {v26.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v28.4s, v7.4s, v5.s[0]\n" + "st1 {v27.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v29.4s, v7.4s, v5.s[1]\n" + "st1 {v28.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v30.4s, v7.4s, v5.s[2]\n" + "st1 {v29.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v31.4s, v7.4s, v5.s[3]\n" + "st1 {v30.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "b 3f\n" + + "2:\n" + "ld1 {v3.4s, v4.4s, v5.4s}, [%[lhs_ptr]], #48\n" + "fmla v8.4s, v6.4s, v0.s[0]\n" + "fmla v9.4s, v6.4s, v0.s[1]\n" + "st1 {v8.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v10.4s, v6.4s, v0.s[2]\n" + "st1 {v9.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v11.4s, v6.4s, v0.s[3]\n" + "st1 {v10.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v12.4s, v6.4s, v1.s[0]\n" + "st1 {v11.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v13.4s, v6.4s, v1.s[1]\n" + "st1 {v12.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v14.4s, v6.4s, v1.s[2]\n" + "st1 {v13.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v15.4s, v6.4s, v1.s[3]\n" + "st1 {v14.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v16.4s, v6.4s, v2.s[0]\n" + "st1 {v15.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v17.4s, v6.4s, v2.s[1]\n" + "st1 {v16.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v18.4s, v6.4s, v2.s[2]\n" + "st1 {v17.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v19.4s, v6.4s, v2.s[3]\n" + "st1 {v18.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v20.4s, v6.4s, v3.s[0]\n" + "st1 {v19.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v21.4s, v6.4s, v3.s[1]\n" + "st1 {v20.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v22.4s, v6.4s, v3.s[2]\n" + "st1 {v21.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v23.4s, v6.4s, v3.s[3]\n" + "st1 {v22.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v24.4s, v6.4s, v4.s[0]\n" + "st1 {v23.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v25.4s, v6.4s, v4.s[1]\n" + "st1 {v24.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v26.4s, v6.4s, v4.s[2]\n" + "st1 {v25.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v27.4s, v6.4s, v4.s[3]\n" + "st1 {v26.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v28.4s, v6.4s, v5.s[0]\n" + "st1 {v27.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v29.4s, v6.4s, v5.s[1]\n" + "st1 {v28.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v30.4s, v6.4s, v5.s[2]\n" + "st1 {v29.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "fmla v31.4s, v6.4s, v5.s[3]\n" + "st1 {v30.4s}, [x0]\n" + "add x0, x0, %[nstride]\n" + "3:\n" + "st1 {v31.4s}, [x0]\n" + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk) + : [oddk] "r"(oddk), [k0] "r"(k0), [nstride] "r"(nstride) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +#else // __aarch64__ +static void sgemm_rowmajor_micro_kernel_6x8(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int nk = k >> 2; + int rk = k & 3; + + const int nstride = stride << 2; + + if (rk == 0) + { + nk--; + rk = 4; + } + + __asm __volatile("vld1.32 {d0-d1}, [%[lhs_ptr]]!\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov r0, %[res_ptr]\n" + + 
"vld1.f32 {d8-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d12-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d16-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d20-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d24-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d28-d31}, [r0]\n" + "b 1f\n" + + "0:\n" + "vmov.i32 q4, #0\n" + "vmov.i32 q5, #0\n" + "vmov.i32 q6, #0\n" + "pld [%[lhs_ptr], #48]\n" + "vmov.i32 q7, #0\n" + "pld [%[rhs_ptr], #48]\n" + "vmov.i32 q8, #0\n" + "pld [%[lhs_ptr], #112]\n" + "vmov.i32 q9, #0\n" + "pld [%[rhs_ptr], #112]\n" + "vmov.i32 q10, #0\n" + "vmov.i32 q11, #0\n" + "vmov.i32 q12, #0\n" + "vmov.i32 q13, #0\n" + "pld [%[lhs_ptr], #176]\n" + "vmov.i32 q14, #0\n" + "pld [%[rhs_ptr], #176]\n" + "vmov.i32 q15, #0\n" + + "1:\n" + "cmp %[nk], #0\n" + "beq 6f\n" + "vmla.f32 q4, q2, d0[0]\n" + "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q12, q2, d2[0]\n" + "vmla.f32 q14, q2, d2[1]\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q13, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + + "vmla.f32 q4, q2, d3[0]\n" + "subs %[nk], %[nk], #1\n" + "vmla.f32 q6, q2, d3[1]\n" + "pld [%[lhs_ptr], #208]\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q10, q2, d0[1]\n" + "pld [%[rhs_ptr], #192]\n" + "vmla.f32 q12, q2, d1[0]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "vmla.f32 q5, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q9, q3, d0[0]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vmla.f32 q13, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n" + + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q6, q2, d2[1]\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q10, q2, d3[1]\n" + "pld [%[lhs_ptr], #240]\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q14, q2, d0[1]\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "vmla.f32 q5, q3, d2[0]\n" + "vmla.f32 q7, q3, d2[1]\n" + "pld [%[rhs_ptr], #208]\n" + "vmla.f32 q9, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q13, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + + "vmla.f32 q4, q2, d1[0]\n" + "vmla.f32 q6, q2, d1[1]\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q10, q2, d2[1]\n" + "vmla.f32 q12, q2, d3[0]\n" + "vmla.f32 q14, q2, d3[1]\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "vmla.f32 q5, q3, d1[0]\n" + "vmla.f32 q7, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q9, q3, d2[0]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vmla.f32 q13, q3, d3[0]\n" + "vmla.f32 q15, q3, d3[1]\n" + "bne 1b\n" + + "6:\n" + "mov r0, %[res_ptr]\n" + "subs %[rk], %[rk], #1\n" + "beq 3f\n" + + "vmla.f32 q4, q2, d0[0]\n" + "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q12, q2, d2[0]\n" + "subs %[rk], %[rk], #1\n" + "vmla.f32 q14, q2, d2[1]\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "vmla.f32 q5, q3, d0[0]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vmla.f32 q9, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q13, q3, d2[0]\n" + "vmla.f32 q15, q3, d2[1]\n" + 
"vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + "beq 4f\n" + + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q6, q2, d3[1]\n" + "subs %[rk], %[rk], #1\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q10, q2, d0[1]\n" + "vmla.f32 q12, q2, d1[0]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "vmla.f32 q5, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q9, q3, d0[0]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vmla.f32 q13, q3, d1[0]\n" + "vmla.f32 q15, q3, d1[1]\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + "beq 5f\n" + + "vld1.32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q6, q2, d2[1]\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q10, q2, d3[1]\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q14, q2, d0[1]\n" + "vld1.32 {d4-d5}, [%[rhs_ptr]]!\n" + + "vmla.f32 q5, q3, d2[0]\n" + "vmla.f32 q7, q3, d2[1]\n" + "vmla.f32 q9, q3, d3[0]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vld1.32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q13, q3, d0[0]\n" + "vmla.f32 q15, q3, d0[1]\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + + "vmla.f32 q4, q2, d1[0]\n" + "vmla.f32 q5, q3, d1[0]\n" + "vst1.32 {d8-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q6, q2, d1[1]\n" + "vmla.f32 q7, q3, d1[1]\n" + "vst1.32 {d12-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q9, q3, d2[0]\n" + "vst1.32 {d16-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q10, q2, d2[1]\n" + "vmla.f32 q11, q3, d2[1]\n" + "vst1.32 {d20-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q12, q2, d3[0]\n" + "vmla.f32 q13, q3, d3[0]\n" + "vst1.32 {d24-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q14, q2, d3[1]\n" + "vmla.f32 q15, q3, d3[1]\n" + "b 2f\n" + + "3:\n" + "vld1.32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q3, d0[0]\n" + "vst1.32 {d8-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q6, q2, d0[1]\n" + "vmla.f32 q7, q3, d0[1]\n" + "vst1.32 {d12-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q8, q2, d1[0]\n" + "vld1.32 {d2}, [%[lhs_ptr]]!\n" + "vmla.f32 q9, q3, d1[0]\n" + "vst1.32 {d16-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q10, q2, d1[1]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vst1.32 {d20-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q12, q2, d2[0]\n" + "vmla.f32 q13, q3, d2[0]\n" + "vst1.32 {d24-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q14, q2, d2[1]\n" + "vmla.f32 q15, q3, d2[1]\n" + "b 2f\n" + + "4:\n" + "vmla.f32 q4, q2, d3[0]\n" + "vmla.f32 q5, q3, d3[0]\n" + "vst1.32 {d8-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q6, q2, d3[1]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vst1.32 {d12-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q8, q2, d0[0]\n" + "vmla.f32 q9, q3, d0[0]\n" + "vst1.32 {d16-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q10, q2, d0[1]\n" + "vmla.f32 q11, q3, d0[1]\n" + "vst1.32 {d20-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q12, q2, d1[0]\n" + "vmla.f32 q13, q3, d1[0]\n" + "vst1.32 {d24-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q14, q2, d1[1]\n" + "vmla.f32 q15, q3, d1[1]\n" + "b 2f\n" + + "5:\n" + "vld1.32 {d0}, [%[lhs_ptr]]!\n" + "vmla.f32 q4, q2, d2[0]\n" + "vmla.f32 q5, q3, d2[0]\n" + "vst1.32 {d8-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q6, q2, d2[1]\n" + "vmla.f32 q7, q3, d2[1]\n" + "vst1.32 {d12-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q8, q2, d3[0]\n" + "vmla.f32 q9, q3, d3[0]\n" + "vst1.32 {d16-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q10, q2, 
d3[1]\n" + "vmla.f32 q11, q3, d3[1]\n" + "vst1.32 {d20-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q13, q3, d0[0]\n" + "vst1.32 {d24-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q14, q2, d0[1]\n" + "vmla.f32 q15, q3, d0[1]\n" + "2:\n" + "vst1.32 {d28-d31}, [r0]\n" + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk), [rk] "+r"(rk) + : [k0] "r"(k0), [nstride] "r"(nstride) + : "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11", "q12", "q13", "q14", "q15", "cc"); +} + +static void sgemm_rowmajor_micro_kernel_4x12(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int rk = (k & 1); + int nk = (k + 1) / 2; + + const int nstride = stride << 2; + + asm volatile("vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov r1, %[res_ptr]\n" + + "subs %[nk], %[nk], #1\n" + "mov r0, r1\n" + "vld1.f32 {d8-d9}, [r0]!\n" + "add r1, %[nstride]\n" + "vld1.f32 {d16-d17}, [r0]!\n" + "vld1.f32 {d24-d25}, [r0]\n" + "mov r0, r1\n" + "vld1.f32 {d10-d11}, [r0]!\n" + "add r1, %[nstride]\n" + "vld1.f32 {d18-d19}, [r0]!\n" + "vld1.f32 {d26-d27}, [r0]\n" + "mov r0, r1\n" + "vld1.f32 {d12-d13}, [r0]!\n" + "add r1, %[nstride]\n" + "vld1.f32 {d20-d21}, [r0]!\n" + "vld1.f32 {d28-d29}, [r0]\n" + "mov r0, r1\n" + "vld1.f32 {d14-d15}, [r0]!\n" + "vld1.f32 {d22-d23}, [r0]!\n" + "vld1.f32 {d30-d31}, [r0]\n" + "beq 2f\n" + + "b 1f\n" + + "0:\n" + "veor q4, q4\n" + "subs %[nk],%[nk], #1\n" + "vmov.f32 q8, q4\n" + "vmov.f32 q12, q4\n" + "vmov.f32 q5, q4\n" + "vmov.f32 q9, q4\n" + "vmov.f32 q13, q4\n" + "vmov.f32 q6, q4\n" + "vmov.f32 q10, q4\n" + "vmov.f32 q14, q4\n" + "vmov.f32 q7, q4\n" + "vmov.f32 q11, q4\n" + "vmov.f32 q15, q4\n" + + "beq 2f\n" + + "1:\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vmla.f32 q7, q2, d1[1]\n" + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + "vmla.f32 q8, q3, d0[0]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q10, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q13, q2, d0[1]\n" + "pld [%[lhs_ptr], #208]\n" + "vmla.f32 q14, q2, d1[0]\n" + "pld [%[rhs_ptr], #192]\n" + "vmla.f32 q15, q2, d1[1]\n" + + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + "vmla.f32 q4, q3, d2[0]\n" + "vmla.f32 q5, q3, d2[1]\n" + "vmla.f32 q6, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q9, q2, d2[1]\n" + "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q10, q2, d3[0]\n" + "vmla.f32 q11, q2, d3[1]\n" + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + "vmla.f32 q12, q3, d2[0]\n" + "vmla.f32 q13, q3, d2[1]\n" + "subs %[nk],%[nk], #1\n" + "pld [%[lhs_ptr], #240]\n" + "vmla.f32 q14, q3, d3[0]\n" + "pld [%[rhs_ptr], #208]\n" + "vmla.f32 q15, q3, d3[1]\n" + "bne 1b\n" + + "2:\n" + "cmp %[rk], #1\n" + "beq 3f\n" + + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vmla.f32 q7, q2, d1[1]\n" + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + "vmla.f32 q8, q3, d0[0]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vmla.f32 q10, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q13, q2, 
d0[1]\n" + "vmla.f32 q14, q2, d1[0]\n" + "vmla.f32 q15, q2, d1[1]\n" + + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + "vld1.f32 {d0-d1}, [%[rhs_ptr]]!\n" + "mov r1, %[res_ptr]\n" + "mov r0, r1\n" + "vmla.f32 q4, q3, d2[0]\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q12, q0, d2[0]\n" + "vst1.f32 {d8-d9}, [r0]!\n" + "add r1, %[nstride]\n" + "vmla.f32 q5, q3, d2[1]\n" + "vst1.f32 {d16-d17}, [r0]!\n" + "vmla.f32 q9, q2, d2[1]\n" + "vst1.f32 {d24-d25}, [r0]\n" + "mov r0, r1\n" + "vmla.f32 q13, q0, d2[1]\n" + "vst1.f32 {d10-d11}, [r0]!\n" + "vmla.f32 q6, q3, d3[0]\n" + "add r1, %[nstride]\n" + "vst1.f32 {d18-d19}, [r0]!\n" + "vmla.f32 q10, q2, d3[0]\n" + "vst1.f32 {d26-d27}, [r0]\n" + "mov r0, r1\n" + "vmla.f32 q14, q0, d3[0]\n" + "vst1.f32 {d12-d13}, [r0]!\n" + "add r1, %[nstride]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vst1.f32 {d20-d21}, [r0]!\n" + "vmla.f32 q11, q2, d3[1]\n" + "vst1.f32 {d28-d29}, [r0]\n" + "mov r0, r1\n" + "vmla.f32 q15, q0, d3[1]\n" + "b 4f\n" + + "3:\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n" + "mov r1, %[res_ptr]\n" + "mov r0, r1\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q8, q3, d0[0]\n" + "vmla.f32 q12, q1, d0[0]\n" + "vst1.f32 {d8-d9}, [r0]!\n" + "add r1, %[nstride]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vst1.f32 {d16-d17}, [r0]!\n" + "vmla.f32 q9, q3, d0[1]\n" + "vst1.f32 {d24-d25}, [r0]\n" + "mov r0, r1\n" + "vmla.f32 q13, q1, d0[1]\n" + "vst1.f32 {d10-d11}, [r0]!\n" + "vmla.f32 q6, q2, d1[0]\n" + "add r1, %[nstride]\n" + "vst1.f32 {d18-d19}, [r0]!\n" + "vmla.f32 q10, q3, d1[0]\n" + "vst1.f32 {d26-d27}, [r0]\n" + "mov r0, r1\n" + "vmla.f32 q14, q1, d1[0]\n" + "vst1.f32 {d12-d13}, [r0]!\n" + "add r1, %[nstride]\n" + "vmla.f32 q7, q2, d1[1]\n" + "vst1.f32 {d20-d21}, [r0]!\n" + "vmla.f32 q11, q3, d1[1]\n" + "vst1.f32 {d28-d29}, [r0]\n" + "mov r0, r1\n" + "vmla.f32 q15, q1, d1[1]\n" + + "4:\n" + "vst1.f32 {d14-d15}, [r0]!\n" + "vst1.f32 {d22-d23}, [r0]!\n" + "vst1.f32 {d30-d31}, [r0]\n" + + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk), [rk] "+r"(rk) + : [k0] "r"(k0), [nstride] "r"(nstride) + : "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11", "q12", "q13", "q14", "q15", "cc"); +} + +static void sgemm_rowmajor_micro_kernel_12x4(const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k, const int k0, + const int stride) +{ + int rk = (k & 1); + int nk = (k + 1) / 2; + + const int nstride = stride << 2; + + asm volatile("vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + + "cmp %[k0], #0\n" + "beq 0f\n" + + "mov r0, %[res_ptr]\n" + "subs %[nk], %[nk], #1\n" + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d28-d29}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d30-d31}, [r0]\n" + "beq 2f\n" + "b 1f\n" + + "0:\n" + "veor q4, q4\n" + "subs %[nk],%[nk], #1\n" + "vmov.f32 q5, q4\n" + "vmov.f32 q6, q4\n" + "vmov.f32 q7, q4\n" + "vmov.f32 q8, q4\n" + "vmov.f32 q9, q4\n" + 
"vmov.f32 q10, q4\n" + "vmov.f32 q11, q4\n" + "vmov.f32 q12, q4\n" + "vmov.f32 q13, q4\n" + "vmov.f32 q14, q4\n" + "vmov.f32 q15, q4\n" + + "beq 2f\n" + + "1:\n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vmla.f32 q7, q2, d1[1]\n" + "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q9, q2, d2[1]\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q10, q2, d3[0]\n" + "vmla.f32 q11, q2, d3[1]\n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q13, q2, d0[1]\n" + "pld [%[rhs_ptr], #208]\n" + "vmla.f32 q14, q2, d1[0]\n" + "pld [%[lhs_ptr], #192]\n" + "vmla.f32 q15, q2, d1[1]\n" + + "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q4, q3, d2[0]\n" + "vmla.f32 q5, q3, d2[1]\n" + "vmla.f32 q6, q3, d3[0]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q8, q3, d0[0]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vld1.f32 {d4-d5}, [%[rhs_ptr]]!\n" + "vmla.f32 q10, q3, d1[0]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q12, q3, d2[0]\n" + "vmla.f32 q13, q3, d2[1]\n" + "subs %[nk],%[nk], #1\n" + "pld [%[rhs_ptr], #240]\n" + "vmla.f32 q14, q3, d3[0]\n" + "pld [%[lhs_ptr], #208]\n" + "vmla.f32 q15, q3, d3[1]\n" + "bne 1b\n" + + "2:\n" + "cmp %[rk], #1\n" + "beq 3f\n" + + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vmla.f32 q7, q2, d1[1]\n" + "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q8, q2, d2[0]\n" + "vmla.f32 q9, q2, d2[1]\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vmla.f32 q10, q2, d3[0]\n" + "vmla.f32 q11, q2, d3[1]\n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q12, q2, d0[0]\n" + "vmla.f32 q13, q2, d0[1]\n" + "vmla.f32 q14, q2, d1[0]\n" + "vmla.f32 q15, q2, d1[1]\n" + + "mov r0, %[res_ptr]\n" + "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q4, q3, d2[0]\n" + "vst1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q5, q3, d2[1]\n" + "vst1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q6, q3, d3[0]\n" + "vst1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q7, q3, d3[1]\n" + "vst1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q8, q3, d0[0]\n" + "vst1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q9, q3, d0[1]\n" + "vst1.f32 {d18-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q10, q3, d1[0]\n" + "vst1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q11, q3, d1[1]\n" + "vst1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q12, q3, d2[0]\n" + "vst1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q13, q3, d2[1]\n" + "vst1.f32 {d26-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q14, q3, d3[0]\n" + "vst1.f32 {d28-d29}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q15, q3, d3[1]\n" + "b 4f\n" + + "3:\n" + "mov r0, %[res_ptr]\n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]!\n" + "vmla.f32 q4, q2, d0[0]\n" + "vst1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q5, q2, d0[1]\n" + "vst1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q6, q2, d1[0]\n" + "vst1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q7, q2, d1[1]\n" + "vst1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vld1.f32 {d0-d1}, [%[lhs_ptr]]!\n" + "vmla.f32 q8, q2, d2[0]\n" + "vst1.f32 {d16-d17}, [r0]\n" + "add r0, r0, 
%[nstride]\n" + "vmla.f32 q9, q2, d2[1]\n" + "vst1.f32 {d18-d19}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q10, q2, d3[0]\n" + "vst1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q11, q2, d3[1]\n" + "vst1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q12, q2, d0[0]\n" + "vst1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q13, q2, d0[1]\n" + "vst1.f32 {d26-d27}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q14, q2, d1[0]\n" + "vst1.f32 {d28-d29}, [r0]\n" + "add r0, r0, %[nstride]\n" + "vmla.f32 q15, q3, d1[1]\n" + + "4:\n" + "vst1.f32 {d30-d31}, [r0]\n" + : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), + [nk] "+r"(nk), [rk] "+r"(rk) + : [k0] "r"(k0), [nstride] "r"(nstride) + : "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc"); +} +#endif // __aarch64__ + +typedef void (*sgemm_rowmajoy_micro_kernel_func)(const float *, const float *, float *, const int, + const int, const int); + +static sgemm_rowmajoy_micro_kernel_func sgemm_rowmajoy_micro_kernel_table[12][12] = { + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { + + 0, 0, 0, 0, 0, +#if !__aarch64__ + sgemm_rowmajor_micro_kernel_4x12, +#else // !__aarch64__ + 0, +#endif // !__aarch64__ + 0, 0, 0, 0, 0, +#if __aarch64__ + sgemm_rowmajor_micro_kernel_4x24 +#else // __aarch64__ + 0 +#endif // __aarch64__ + }, + {0, 0, 0, +#if !__aarch64__ + sgemm_rowmajor_micro_kernel_6x8, +#else // !__aarch64__ + 0, +#endif // !__aarch64__ + 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, +#if __aarch64__ + sgemm_rowmajor_micro_kernel_8x12, +#else // __aarch64__ + 0, +#endif // __aarch64__ + 0, 0, 0, 0, 0, 0 + + }, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + }, + {0, +#if !__aarch64__ + sgemm_rowmajor_micro_kernel_12x4, +#else // !__aarch64__ + 0, +#endif // !__aarch64__ + 0, +#if __aarch64__ + sgemm_rowmajor_micro_kernel_12x8, +#else // __aarch64__ + 0, +#endif // __aarch64__ + 0, 0, 0, 0, 0, 0, 0, 0 + + }, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + }, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + }, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + }, + {0, +#if __aarch64__ + sgemm_rowmajor_micro_kernel_24x4, +#else // __aarch64__ + 0, +#endif // __aarch64__ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + }, + +}; + +void _sgemm_rowmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int nstride, + const int kstride) +{ + const int nm = (mb + mr - 1) / mr; + const int nn = (nb + nr - 1) / nr; + const int rm = mb % mr; + const int rn = nb % nr; + + sgemm_rowmajoy_micro_kernel_func sgemm_rowmajoy_micro_kernel = + sgemm_rowmajoy_micro_kernel_table[mr / 2 - 1][nr / 2 - 1]; + if (!sgemm_rowmajoy_micro_kernel) + return; + + for (int j = 0; j < nn; j++) + { + const int _nr = (j != nn - 1 || rn == 0) ? nr : rn; + for (int i = 0; i < nm; i++) + { + const int _mr = (i != nm - 1 || rm == 0) ? 
mr : rm; + if (_mr == mr && _nr == nr) + { + sgemm_rowmajoy_micro_kernel(&lhs_ptr[i * mr * kstride], &rhs_ptr[j * nr * kstride], + &res_ptr[i * mr * nstride + j * nr], kb, k0, nstride); + } + else + { + float res_micro[mr * nr]; + float *res = &res_ptr[i * mr * nstride + j * nr]; + + sgemm_rowmajoy_micro_kernel(&lhs_ptr[i * mr * kstride], &rhs_ptr[j * nr * kstride], + res_micro, kb, 0, nr); + if (k0 == 0) + { + for (int pi = 0; pi < _mr; pi++) + { + for (int pj = 0; pj < _nr; pj++) + { + res[pi * nstride + pj] = res_micro[pi * nr + pj]; + } + } + } + else + { + for (int pi = 0; pi < _mr; pi++) + { + for (int pj = 0; pj < _nr; pj++) + { + res[pi * nstride + pj] += res_micro[pi * nr + pj]; + } + } + } + } + } + } +} + +void _sgemm_rowmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int nstride, + const int kstride) +{ + const int nm = (mb + mr - 1) / mr; + const int nn = (nb + nr - 1) / nr; + const int rm = mb % mr; + const int rn = nb % nr; + + sgemm_rowmajoy_micro_kernel_func sgemm_rowmajoy_micro_kernel = + sgemm_rowmajoy_micro_kernel_table[mr / 2 - 1][nr / 2 - 1]; + if (!sgemm_rowmajoy_micro_kernel) + return; + + for (int j = 0; j < nm; j++) + { + const int _mr = (j != nm - 1 || rm == 0) ? mr : rm; + for (int i = 0; i < nn; i++) + { + const int _nr = (i != nn - 1 || rn == 0) ? nr : rn; + if (_mr == mr && _nr == nr) + { + sgemm_rowmajoy_micro_kernel(&lhs_ptr[j * mr * kstride], &rhs_ptr[i * nr * kstride], + &res_ptr[j * mr * nstride + i * nr], kb, k0, nstride); + } + else + { + float res_micro[mr * nr]; + float *res = &res_ptr[j * mr * nstride + i * nr]; + + sgemm_rowmajoy_micro_kernel(&lhs_ptr[j * mr * kstride], &rhs_ptr[i * nr * kstride], + res_micro, kb, 0, nr); + if (k0 == 0) + { + for (int pi = 0; pi < _mr; pi++) + { + for (int pj = 0; pj < _nr; pj++) + { + res[pi * nstride + pj] = res_micro[pi * nr + pj]; + } + } + } + else + { + for (int pi = 0; pi < _mr; pi++) + { + for (int pj = 0; pj < _nr; pj++) + { + res[pi * nstride + pj] += res_micro[pi * nr + pj]; + } + } + } + } + } + } +} + +void _sgemm_colmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int mstride, + const int kstride) +{ + _sgemm_rowmajor_macro_kernel_divmn(nr, mr, nb, mb, kb, rhs_ptr, lhs_ptr, res_ptr, k0, mstride, + kstride); +} + +void _sgemm_colmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int mstride, + const int kstride) +{ + _sgemm_rowmajor_macro_kernel_divnm(nr, mr, nb, mb, kb, rhs_ptr, lhs_ptr, res_ptr, k0, mstride, + kstride); +} + +#if __aarch64__ +void _sparse_sgemm_kernel(const int nb, float lhs_data, const float *rhs_ptr, float *res_ptr) +{ + int nn = nb >> 3; + int rn = nb & 7; + + if (nn > 0) + { + asm volatile("mov x0, %[res_ptr]\n" + "dup v0.2d, %[lhs_data]\n" + "ld1 {v1.4s}, [%[rhs_ptr]], #16\n" + "ld1 {v2.4s}, [x0], #16\n" + + "subs %[nn], %[nn], #1\n" + "beq 2f\n" + + "1:\n" + "ld1 {v4.4s}, [x0], #16\n" + "ld1 {v3.4s}, [%[rhs_ptr]], #16\n" + + "fmla v2.4s, v1.4s, v0.s[0]\n" + "st1 {v2.4s}, [%[res_ptr]], #16\n" + + "ld1 {v2.4s}, [x0], #16\n" + "ld1 {v1.4s}, [%[rhs_ptr]], #16\n" + + "fmla v4.4s, v3.4s, v0.s[0]\n" + "st1 {v4.4s}, [%[res_ptr]], #16\n" + + "subs %[nn], %[nn], #1\n" + "bne 1b\n" + + "2:\n" + "ld1 
{v3.4s}, [%[rhs_ptr]], #16\n" + "ld1 {v4.4s}, [x0], #16\n" + + "fmla v2.4s, v1.4s, v0.s[0]\n" + "st1 {v2.4s}, [%[res_ptr]], #16\n" + + "fmla v4.4s, v3.4s, v0.s[0]\n" + "st1 {v4.4s}, [%[res_ptr]], #16\n" + : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), [nn] "+r"(nn) + : [lhs_data] "r"(lhs_data) + : "x0", "v0", "v1", "v2", "v3", "v4", "cc"); + } + if (rn > 0) + { + int _nn = rn >> 2; + int _rn = rn & 3; + + if (_nn > 0) + { + asm volatile("dup v0.2d, %[lhs_data]\n" + "ld1 {v1.4s}, [%[rhs_ptr]], #16\n" + "ld1 {v2.4s}, [%[res_ptr]]\n" + "fmla v2.4s, v1.4s, v0.s[0]\n" + "st1 {v2.4s}, [%[res_ptr]], #16\n" + : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr) + : [lhs_data] "r"(lhs_data) + : "v0", "v1", "v2", "cc"); + } + if (_rn > 0) + { + for (int i = 0; i < _rn; i++) + { + res_ptr[i] += lhs_data * rhs_ptr[i]; + } + } + } +} + +#else // __aarch64__ +void _sparse_sgemm_kernel(const int nb, float lhs_data, const float *rhs_ptr, float *res_ptr) +{ + int nn = nb >> 3; + int rn = nb & 7; + + if (nn > 0) + { + asm volatile("mov r0, %[res_ptr]\n" + "vdup.32 d0, %[lhs_data]\n" + "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n" + "vld1.f32 {d4-d5}, [r0]!\n" + + "subs %[nn], %[nn], #1\n" + "beq 2f\n" + + "1:\n" + "vld1.f32 {d8-d9}, [r0]!\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + + "vmla.f32 q2, q1, d0[0]\n" + "vst1.f32 {d4-d5}, [%[res_ptr]]!\n" + + "vld1.f32 {d4-d5}, [r0]!\n" + "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n" + + "vmla.f32 q4, q3, d0[0]\n" + "vst1.f32 {d8-d9}, [%[res_ptr]]!\n" + + "subs %[nn], %[nn], #1\n" + "bne 1b\n" + + "2:\n" + "vld1.f32 {d6-d7}, [%[rhs_ptr]]!\n" + "vld1.f32 {d8-d9}, [r0]!\n" + + "vmla.f32 q2, q1, d0[0]\n" + "vst1.f32 {d4-d5}, [%[res_ptr]]!\n" + + "vmla.f32 q4, q3, d0[0]\n" + "vst1.f32 {d8-d9}, [%[res_ptr]]!\n" + : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr), [nn] "+r"(nn) + : [lhs_data] "r"(lhs_data) + : "r0", "q0", "q1", "q2", "q3", "q4", "cc"); + } + if (rn > 0) + { + int _nn = rn >> 2; + int _rn = rn & 3; + + if (_nn > 0) + { + asm volatile("vdup.32 d0, %[lhs_data]\n" + "vld1.f32 {d2-d3}, [%[rhs_ptr]]!\n" + "vld1.f32 {d4-d5}, [%[res_ptr]]\n" + "vmla.f32 q2, q1, d0[0]\n" + "vst1.f32 {d4-d5}, [%[res_ptr]]!\n" + : [rhs_ptr] "+r"(rhs_ptr), [res_ptr] "+r"(res_ptr) + : [lhs_data] "r"(lhs_data) + : "q0", "q1", "q2", "cc"); + } + if (_rn > 0) + { + for (int i = 0; i < _rn; i++) + { + res_ptr[i] += lhs_data * rhs_ptr[i]; + } + } + } +} +#endif // __aarch64__ + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/sgemm_kernel.h b/compute/ncnn/src/srcn/sgemm_kernel.h new file mode 100644 index 000000000..9e220bc33 --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_kernel.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_SRCN_SGEMM_KERNEL_H__ +#define __NNFW_SRCN_SGEMM_KERNEL_H__ + +#include "ncnn/srcn/conv_type.h" + +namespace nnfw +{ +namespace srcn +{ + +void _sgemm_rowmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int nstride, + const int kstride); + +void _sgemm_rowmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int nstride, + const int kstride); + +void _sgemm_colmajor_macro_kernel_divnm(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int mstride, + const int kstride); + +void _sgemm_colmajor_macro_kernel_divmn(const int mr, const int nr, const int mb, const int nb, + const int kb, const float *lhs_ptr, const float *rhs_ptr, + float *res_ptr, const int k0, const int mstride, + const int kstride); + +void _sparse_sgemm_kernel(const int nb, float lhs_data, const float *rhs_ptr, float *res_ptr); + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_SGEMM_KERNEL_H__ diff --git a/compute/ncnn/src/srcn/sgemm_pack.cc b/compute/ncnn/src/srcn/sgemm_pack.cc new file mode 100644 index 000000000..8767f6c0a --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_pack.cc @@ -0,0 +1,2316 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <stdlib.h> +#include <arm_neon.h> + +#include "ncnn/srcn/conv_type.h" +#include "common.h" + +namespace nnfw +{ +namespace srcn +{ + +void _pack_rowmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr) +{ + const int nm = mb / mr; + const int rm = mb % mr; + + switch (mr) + { +#if __aarch64__ + case 24: + for (int i = 0; i < nm; i++) + { + int nk = kb >> 2; + int rk = kb & 0x03; + + const float *lhs_temp = lhs_ptr; + const int _stride = stride << 2; + + if (nk > 0) + { + asm volatile("0:\n" + "mov x0, %[lhs_temp]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v9.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v8.4s, v10.4s\n" + "zip2 v30.4s, v8.4s, v10.4s\n" + "zip1 v29.4s, v9.4s, v11.4s\n" + "zip2 v31.4s, v9.4s, v11.4s\n" + "zip1 v8.4s, v28.4s, v29.4s\n" + "zip2 v9.4s, v28.4s, v29.4s\n" + "zip1 v10.4s, v30.4s, v31.4s\n" + "zip2 v11.4s, v30.4s, v31.4s\n" + + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v13.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v14.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v15.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v12.4s, v14.4s\n" + "zip2 v30.4s, v12.4s, v14.4s\n" + "zip1 v29.4s, v13.4s, v15.4s\n" + "zip2 v31.4s, v13.4s, v15.4s\n" + "zip1 v12.4s, v28.4s, v29.4s\n" + "zip2 v13.4s, v28.4s, v29.4s\n" + "zip1 v14.4s, v30.4s, v31.4s\n" + "zip2 v15.4s, v30.4s, v31.4s\n" + + "ld1 {v16.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v17.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v18.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v19.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v16.4s, v18.4s\n" + "zip2 v30.4s, v16.4s, v18.4s\n" + "zip1 v29.4s, v17.4s, v19.4s\n" + "zip2 v31.4s, v17.4s, v19.4s\n" + "zip1 v16.4s, v28.4s, v29.4s\n" + "zip2 v17.4s, v28.4s, v29.4s\n" + "zip1 v18.4s, v30.4s, v31.4s\n" + "zip2 v19.4s, v30.4s, v31.4s\n" + + "ld1 {v20.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v21.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v22.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v23.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v20.4s, v22.4s\n" + "zip2 v30.4s, v20.4s, v22.4s\n" + "zip1 v29.4s, v21.4s, v23.4s\n" + "zip2 v31.4s, v21.4s, v23.4s\n" + "zip1 v20.4s, v28.4s, v29.4s\n" + "zip2 v21.4s, v28.4s, v29.4s\n" + "zip1 v22.4s, v30.4s, v31.4s\n" + "zip2 v23.4s, v30.4s, v31.4s\n" + + "ld1 {v24.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v25.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v26.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v27.4s}, [x0]\n" + + "zip1 v28.4s, v24.4s, v26.4s\n" + "zip2 v30.4s, v24.4s, v26.4s\n" + "zip1 v29.4s, v25.4s, v27.4s\n" + "zip2 v31.4s, v25.4s, v27.4s\n" + "zip1 v24.4s, v28.4s, v29.4s\n" + "zip2 v25.4s, v28.4s, v29.4s\n" + "zip1 v26.4s, v30.4s, v31.4s\n" + "zip2 v27.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[plhs_ptr]], #16\n" + "st1 
{v8.4s}, [%[plhs_ptr]], #16\n" + "st1 {v12.4s}, [%[plhs_ptr]], #16\n" + "st1 {v16.4s}, [%[plhs_ptr]], #16\n" + "st1 {v20.4s}, [%[plhs_ptr]], #16\n" + "st1 {v24.4s}, [%[plhs_ptr]], #16\n" + "st1 {v5.4s}, [%[plhs_ptr]], #16\n" + "st1 {v9.4s}, [%[plhs_ptr]], #16\n" + "st1 {v13.4s}, [%[plhs_ptr]], #16\n" + "st1 {v17.4s}, [%[plhs_ptr]], #16\n" + "st1 {v21.4s}, [%[plhs_ptr]], #16\n" + "st1 {v25.4s}, [%[plhs_ptr]], #16\n" + "st1 {v6.4s}, [%[plhs_ptr]], #16\n" + "st1 {v10.4s}, [%[plhs_ptr]], #16\n" + "st1 {v14.4s}, [%[plhs_ptr]], #16\n" + "st1 {v18.4s}, [%[plhs_ptr]], #16\n" + "st1 {v22.4s}, [%[plhs_ptr]], #16\n" + "st1 {v26.4s}, [%[plhs_ptr]], #16\n" + "st1 {v7.4s}, [%[plhs_ptr]], #16\n" + "st1 {v11.4s}, [%[plhs_ptr]], #16\n" + "st1 {v15.4s}, [%[plhs_ptr]], #16\n" + "st1 {v19.4s}, [%[plhs_ptr]], #16\n" + "st1 {v23.4s}, [%[plhs_ptr]], #16\n" + "st1 {v27.4s}, [%[plhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + } + + for (int j = 0; j < rk; j++) + { + plhs_ptr[0] = lhs_temp[0]; + plhs_ptr[1] = lhs_temp[stride]; + plhs_ptr[2] = lhs_temp[stride << 1]; + plhs_ptr[3] = lhs_temp[3 * stride]; + plhs_ptr[4] = lhs_temp[stride << 2]; + plhs_ptr[5] = lhs_temp[5 * stride]; + plhs_ptr[6] = lhs_temp[6 * stride]; + plhs_ptr[7] = lhs_temp[7 * stride]; + plhs_ptr[8] = lhs_temp[stride << 3]; + plhs_ptr[9] = lhs_temp[9 * stride]; + plhs_ptr[10] = lhs_temp[10 * stride]; + plhs_ptr[11] = lhs_temp[11 * stride]; + plhs_ptr[12] = lhs_temp[12 * stride]; // fixed: row 12 was mistakenly read from row 0 + plhs_ptr[13] = lhs_temp[13 * stride]; + plhs_ptr[14] = lhs_temp[14 * stride]; + plhs_ptr[15] = lhs_temp[15 * stride]; + plhs_ptr[16] = lhs_temp[stride << 4]; + plhs_ptr[17] = lhs_temp[17 * stride]; + plhs_ptr[18] = lhs_temp[18 * stride]; + plhs_ptr[19] = lhs_temp[19 * stride]; + plhs_ptr[20] = lhs_temp[20 * stride]; + plhs_ptr[21] = lhs_temp[21 * stride]; + plhs_ptr[22] = lhs_temp[22 * stride]; + plhs_ptr[23] = lhs_temp[23 * stride]; + plhs_ptr += mr; + lhs_temp++; + } + + lhs_ptr += mr * stride; + } + break; + case 16: + for (int i = 0; i < nm; i++) + { + int nk = kb >> 2; + int rk = kb & 0x03; + + const float *lhs_temp = lhs_ptr; + const int _stride = stride << 2; + + if (nk > 0) + { + asm volatile("0:\n" + "mov x0, %[lhs_temp]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v9.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v8.4s, v10.4s\n" + "zip2 v30.4s, v8.4s, v10.4s\n" + "zip1 v29.4s, v9.4s, v11.4s\n" + "zip2 v31.4s, v9.4s, v11.4s\n" + "zip1 v8.4s, v28.4s, v29.4s\n" + "zip2 v9.4s, v28.4s, v29.4s\n" + "zip1 v10.4s, v30.4s, v31.4s\n" + "zip2 v11.4s, v30.4s, v31.4s\n" + + "ld1 {v12.4s}, [x0]\n" + "add x0,
x0, %[_stride]\n" + "ld1 {v13.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v14.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v15.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v12.4s, v14.4s\n" + "zip2 v30.4s, v12.4s, v14.4s\n" + "zip1 v29.4s, v13.4s, v15.4s\n" + "zip2 v31.4s, v13.4s, v15.4s\n" + "zip1 v12.4s, v28.4s, v29.4s\n" + "zip2 v13.4s, v28.4s, v29.4s\n" + "zip1 v14.4s, v30.4s, v31.4s\n" + "zip2 v15.4s, v30.4s, v31.4s\n" + + "ld1 {v16.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v17.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v18.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v19.4s}, [x0]\n" + + "zip1 v28.4s, v16.4s, v18.4s\n" + "zip2 v30.4s, v16.4s, v18.4s\n" + "zip1 v29.4s, v17.4s, v19.4s\n" + "zip2 v31.4s, v17.4s, v19.4s\n" + "zip1 v16.4s, v28.4s, v29.4s\n" + "zip2 v17.4s, v28.4s, v29.4s\n" + "zip1 v18.4s, v30.4s, v31.4s\n" + "zip2 v19.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[plhs_ptr]], #16\n" + "st1 {v8.4s}, [%[plhs_ptr]], #16\n" + "st1 {v12.4s}, [%[plhs_ptr]], #16\n" + "st1 {v16.4s}, [%[plhs_ptr]], #16\n" + "st1 {v5.4s}, [%[plhs_ptr]], #16\n" + "st1 {v9.4s}, [%[plhs_ptr]], #16\n" + "st1 {v13.4s}, [%[plhs_ptr]], #16\n" + "st1 {v17.4s}, [%[plhs_ptr]], #16\n" + "st1 {v6.4s}, [%[plhs_ptr]], #16\n" + "st1 {v10.4s}, [%[plhs_ptr]], #16\n" + "st1 {v14.4s}, [%[plhs_ptr]], #16\n" + "st1 {v18.4s}, [%[plhs_ptr]], #16\n" + "st1 {v7.4s}, [%[plhs_ptr]], #16\n" + "st1 {v11.4s}, [%[plhs_ptr]], #16\n" + "st1 {v15.4s}, [%[plhs_ptr]], #16\n" + "st1 {v19.4s}, [%[plhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", + "v30", "v31"); + } + + for (int j = 0; j < rk; j++) + { + plhs_ptr[0] = lhs_temp[0]; + plhs_ptr[1] = lhs_temp[stride]; + plhs_ptr[2] = lhs_temp[stride << 1]; + plhs_ptr[3] = lhs_temp[3 * stride]; + plhs_ptr[4] = lhs_temp[stride << 2]; + plhs_ptr[5] = lhs_temp[5 * stride]; + plhs_ptr[6] = lhs_temp[6 * stride]; + plhs_ptr[7] = lhs_temp[7 * stride]; + plhs_ptr[8] = lhs_temp[stride << 3]; + plhs_ptr[9] = lhs_temp[9 * stride]; + plhs_ptr[10] = lhs_temp[10 * stride]; + plhs_ptr[11] = lhs_temp[11 * stride]; + plhs_ptr[12] = lhs_temp[12 * stride]; // fixed: row 12 was mistakenly read from row 0 + plhs_ptr[13] = lhs_temp[13 * stride]; + plhs_ptr[14] = lhs_temp[14 * stride]; + plhs_ptr[15] = lhs_temp[15 * stride]; + plhs_ptr += mr; + lhs_temp++; + } + + lhs_ptr += mr * stride; + } + break; +#endif // __aarch64__ + case 12: + for (int i = 0; i < nm; i++) + { + int nk = kb >> 2; + int rk = kb & 0x03; + + const float *lhs_temp = lhs_ptr; + const int _stride = stride << 2; + + if (nk > 0) + { +#if __aarch64__ + asm volatile("0:\n" + "mov x0, %[lhs_temp]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v9.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0,
%[_stride]\n" + + "zip1 v28.4s, v8.4s, v10.4s\n" + "zip2 v30.4s, v8.4s, v10.4s\n" + "zip1 v29.4s, v9.4s, v11.4s\n" + "zip2 v31.4s, v9.4s, v11.4s\n" + "zip1 v8.4s, v28.4s, v29.4s\n" + "zip2 v9.4s, v28.4s, v29.4s\n" + "zip1 v10.4s, v30.4s, v31.4s\n" + "zip2 v11.4s, v30.4s, v31.4s\n" + + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v13.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v14.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v15.4s}, [x0]\n" + + "zip1 v28.4s, v12.4s, v14.4s\n" + "zip2 v30.4s, v12.4s, v14.4s\n" + "zip1 v29.4s, v13.4s, v15.4s\n" + "zip2 v31.4s, v13.4s, v15.4s\n" + "zip1 v12.4s, v28.4s, v29.4s\n" + "zip2 v13.4s, v28.4s, v29.4s\n" + "zip1 v14.4s, v30.4s, v31.4s\n" + "zip2 v15.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[plhs_ptr]], #16\n" + "st1 {v8.4s}, [%[plhs_ptr]], #16\n" + "st1 {v12.4s}, [%[plhs_ptr]], #16\n" + "st1 {v5.4s}, [%[plhs_ptr]], #16\n" + "st1 {v9.4s}, [%[plhs_ptr]], #16\n" + "st1 {v13.4s}, [%[plhs_ptr]], #16\n" + "st1 {v6.4s}, [%[plhs_ptr]], #16\n" + "st1 {v10.4s}, [%[plhs_ptr]], #16\n" + "st1 {v14.4s}, [%[plhs_ptr]], #16\n" + "st1 {v7.4s}, [%[plhs_ptr]], #16\n" + "st1 {v11.4s}, [%[plhs_ptr]], #16\n" + "st1 {v15.4s}, [%[plhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v28", "v29", "v30", "v31"); +#else // __aarch64__ + asm volatile("0:\n" + "mov r0, %[lhs_temp]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[_stride]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[_stride]\n" + + "vzip.32 q8, q10\n" + "vzip.32 q9, q11\n" + "vzip.32 q8, q9\n" + "vzip.32 q10, q11\n" + + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d28-d29}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d30-d31}, [r0]\n" + + "vzip.32 q12, q14\n" + "vzip.32 q13, q15\n" + "vzip.32 q12, q13\n" + "vzip.32 q14, q15\n" + + "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n" + "vst1.f32 {d16-d17}, [%[plhs_ptr]]!\n" + "vst1.f32 {d24-d25}, [%[plhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n" + "vst1.f32 {d18-d19}, [%[plhs_ptr]]!\n" + "vst1.f32 {d26-d27}, [%[plhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n" + "vst1.f32 {d20-d21}, [%[plhs_ptr]]!\n" + "vst1.f32 {d28-d29}, [%[plhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n" + "vst1.f32 {d22-d23}, [%[plhs_ptr]]!\n" + "vst1.f32 {d30-d31}, [%[plhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +#endif // __aarch64__ + } + + for (int j = 0; j < rk; j++) + { + plhs_ptr[0] = lhs_temp[0]; + plhs_ptr[1] = lhs_temp[stride]; + plhs_ptr[2] = lhs_temp[stride << 1]; + plhs_ptr[3] = lhs_temp[3 * stride]; + 
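/* Scalar fallback for the kb % 4 leftover columns: plhs_ptr[i] takes row i of
   the current column, i.e. lhs_temp[i * stride]; forms such as stride << 1
   and stride << 2 are just strength-reduced multiplies of that index. */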
plhs_ptr[4] = lhs_temp[stride << 2]; + plhs_ptr[5] = lhs_temp[5 * stride]; + plhs_ptr[6] = lhs_temp[6 * stride]; + plhs_ptr[7] = lhs_temp[7 * stride]; + plhs_ptr[8] = lhs_temp[stride << 3]; + plhs_ptr[9] = lhs_temp[9 * stride]; + plhs_ptr[10] = lhs_temp[10 * stride]; + plhs_ptr[11] = lhs_temp[11 * stride]; + plhs_ptr += mr; + lhs_temp++; + } + + lhs_ptr += mr * stride; + } + break; + case 8: + for (int i = 0; i < nm; i++) + { + int nk = kb >> 2; + int rk = kb & 0x03; + + const float *lhs_temp = lhs_ptr; + const int _stride = stride << 2; + + if (nk > 0) + { +#if __aarch64__ + asm volatile("0:\n" + "mov x0, %[lhs_temp]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v9.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "zip1 v28.4s, v8.4s, v10.4s\n" + "zip2 v30.4s, v8.4s, v10.4s\n" + "zip1 v29.4s, v9.4s, v11.4s\n" + "zip2 v31.4s, v9.4s, v11.4s\n" + "zip1 v8.4s, v28.4s, v29.4s\n" + "zip2 v9.4s, v28.4s, v29.4s\n" + "zip1 v10.4s, v30.4s, v31.4s\n" + "zip2 v11.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[plhs_ptr]], #16\n" + "st1 {v8.4s}, [%[plhs_ptr]], #16\n" + "st1 {v5.4s}, [%[plhs_ptr]], #16\n" + "st1 {v9.4s}, [%[plhs_ptr]], #16\n" + "st1 {v6.4s}, [%[plhs_ptr]], #16\n" + "st1 {v10.4s}, [%[plhs_ptr]], #16\n" + "st1 {v7.4s}, [%[plhs_ptr]], #16\n" + "st1 {v11.4s}, [%[plhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v28", "v29", "v30", "v31"); +#else // __aarch64__ + asm volatile("0:\n" + "mov r0, %[lhs_temp]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[_stride]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vzip.32 q8, q10\n" + "vzip.32 q9, q11\n" + "vzip.32 q8, q9\n" + "vzip.32 q10, q11\n" + + "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n" + "vst1.f32 {d16-d17}, [%[plhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n" + "vst1.f32 {d18-d19}, [%[plhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n" + "vst1.f32 {d20-d21}, [%[plhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n" + "vst1.f32 {d22-d23}, [%[plhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif // __aarch64__ + } + + for (int j = 0; j < rk; j++) + { + plhs_ptr[0] = lhs_temp[0]; + plhs_ptr[1] = 
lhs_temp[stride]; + plhs_ptr[2] = lhs_temp[stride << 1]; + plhs_ptr[3] = lhs_temp[3 * stride]; + plhs_ptr[4] = lhs_temp[stride << 2]; + plhs_ptr[5] = lhs_temp[5 * stride]; + plhs_ptr[6] = lhs_temp[6 * stride]; + plhs_ptr[7] = lhs_temp[7 * stride]; + plhs_ptr += mr; + lhs_temp++; + } + + lhs_ptr += mr * stride; + } + break; + case 6: + for (int i = 0; i < nm; i++) + { + int nk = kb >> 2; + int rk = kb & 0x03; + + const float *lhs_temp = lhs_ptr; + const int _stride = stride << 2; + + if (nk > 0) + { +#if __aarch64__ + // TODO: 4--->6 + asm volatile("0:\n" + "mov x0, %[lhs_temp]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v8.4s}, [x0]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[plhs_ptr]], #16\n" + "st1 {v5.4s}, [%[plhs_ptr]], #16\n" + "st1 {v6.4s}, [%[plhs_ptr]], #16\n" + "st1 {v7.4s}, [%[plhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v28", "v29", "v30", "v31"); +#else // __aarch64__ + asm volatile("0:\n" + "mov r0, %[lhs_temp]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + "vzip.32 q8, q9\n" + + "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n" + "vst1.f32 {d16}, [%[plhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n" + "vst1.f32 {d17}, [%[plhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n" + "vst1.f32 {d18}, [%[plhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n" + "vst1.f32 {d19}, [%[plhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9"); +#endif // __aarch64__ + } + + for (int j = 0; j < rk; j++) + { + plhs_ptr[0] = lhs_temp[0]; + plhs_ptr[1] = lhs_temp[stride]; + plhs_ptr[2] = lhs_temp[stride << 1]; + plhs_ptr[3] = lhs_temp[3 * stride]; + plhs_ptr[4] = lhs_temp[stride << 2]; + plhs_ptr[5] = lhs_temp[5 * stride]; + plhs_ptr += mr; + lhs_temp++; + } + + lhs_ptr += mr * stride; + } + break; + case 4: + for (int i = 0; i < nm; i++) + { + int nk = kb >> 2; + int rk = kb & 0x03; + + const float *lhs_temp = lhs_ptr; + const int _stride = stride << 2; + + if (nk > 0) + { +#if __aarch64__ + asm volatile("0:\n" + "mov x0, %[lhs_temp]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + 
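/* Second zip round: the interleaved row pairs in v28..v31 are recombined so
   that v4..v7 (completed by the two zips below) hold the columns of the 4x4
   tile loaded from x0, i.e. an in-register transpose. */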
"zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[plhs_ptr]], #16\n" + "st1 {v5.4s}, [%[plhs_ptr]], #16\n" + "st1 {v6.4s}, [%[plhs_ptr]], #16\n" + "st1 {v7.4s}, [%[plhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v28", "v29", "v30", "v31"); +#else // __aarch64__ + asm volatile("0:\n" + "mov r0, %[lhs_temp]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + + "vst1.f32 {d8-d9}, [%[plhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[plhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[plhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[plhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[lhs_temp], %[lhs_temp], #16\n" + "bne 0b\n" + : [lhs_temp] "+r"(lhs_temp), [plhs_ptr] "+r"(plhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7"); +#endif // __aarch64__ + } + + for (int j = 0; j < rk; j++) + { + plhs_ptr[0] = lhs_temp[0]; + plhs_ptr[1] = lhs_temp[stride]; + plhs_ptr[2] = lhs_temp[stride << 1]; + plhs_ptr[3] = lhs_temp[3 * stride]; + plhs_ptr += mr; + lhs_temp++; + } + + lhs_ptr += mr * stride; + } + break; + default: + break; + } + + if (rm > 0) + { + for (int j = 0; j < kb; j++) + { + for (int i = 0; i < rm; i++) + { + plhs_ptr[i] = lhs_ptr[i * stride]; + } + for (int i = rm; i < mr; i++) + { + plhs_ptr[i] = 0.f; + } + plhs_ptr += mr; + lhs_ptr++; + } + } +} + +void _pack_rowmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr) +{ + const int nn = nb / nr; + const int rn = nb % nr; + + switch (nr) + { + case 24: + for (int j = 0; j < nn; j++) + { + const float *rhs_temp = rhs_ptr; + float32x4_t q0, q1, q2, q3, q4, q5; + for (int i = 0; i < kb; i++) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1q_f32(rhs_temp + 4); + q2 = vld1q_f32(rhs_temp + 8); + q3 = vld1q_f32(rhs_temp + 12); + q4 = vld1q_f32(rhs_temp + 16); + q5 = vld1q_f32(rhs_temp + 20); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + vst1q_f32(prhs_ptr + 12, q3); + vst1q_f32(prhs_ptr + 16, q4); + vst1q_f32(prhs_ptr + 20, q5); + + rhs_temp += stride; + prhs_ptr += nr; + } + + rhs_ptr += nr; + } + break; + case 16: + for (int j = 0; j < nn; j++) + { + const float *rhs_temp = rhs_ptr; + float32x4_t q0, q1, q2, q3; + for (int i = 0; i < kb; i++) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1q_f32(rhs_temp + 4); + q2 = vld1q_f32(rhs_temp + 8); + q3 = vld1q_f32(rhs_temp + 12); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + vst1q_f32(prhs_ptr + 12, q3); + + rhs_temp += stride; + prhs_ptr += nr; + } + + rhs_ptr += nr; + } + break; + case 12: + for (int j = 0; j < nn; j++) + { + const float *rhs_temp = rhs_ptr; + float32x4_t q0, q1, q2; + for (int i = 0; i < kb; i++) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1q_f32(rhs_temp + 4); + q2 = vld1q_f32(rhs_temp + 8); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + + rhs_temp += stride; + prhs_ptr += nr; + } + + rhs_ptr += nr; + } + break; + case 8: + for (int j = 0; j < nn; j++) + + { + const float *rhs_temp = rhs_ptr; 
+ float32x4_t q0, q1, q2, q3; + + int i = 0; + for (; i + 1 < kb; i += 2) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1q_f32(rhs_temp + 4); + q2 = vld1q_f32(rhs_temp + stride); + q3 = vld1q_f32(rhs_temp + stride + 4); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + vst1q_f32(prhs_ptr + 12, q3); + + rhs_temp += stride << 1; + prhs_ptr += nr << 1; + } + + for (; i < kb; i++) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1q_f32(rhs_temp + 4); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + + rhs_temp += stride; + prhs_ptr += nr; + } + + rhs_ptr += nr; + } + break; + case 6: + for (int j = 0; j < nn; j++) + + { + const float *rhs_temp = rhs_ptr; + float32x4_t q0, q2; + float32x2_t q1, q3; + + int i = 0; + for (; i + 1 < kb; i += 2) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1_f32(rhs_temp + 4); + + q2 = vld1q_f32(rhs_temp + stride); + q3 = vld1_f32(rhs_temp + stride + 4); + vst1q_f32(prhs_ptr, q0); + vst1_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 6, q2); + vst1_f32(prhs_ptr + 10, q3); + + rhs_temp += stride << 1; + prhs_ptr += nr << 1; + } + + for (; i < kb; i++) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1_f32(rhs_temp + 4); + + vst1q_f32(prhs_ptr, q0); + vst1_f32(prhs_ptr + 4, q1); + + rhs_temp += stride; + prhs_ptr += nr; + } + + rhs_ptr += nr; + } + break; + case 4: + for (int j = 0; j < nn; j++) + + { + const float *rhs_temp = rhs_ptr; + float32x4_t q0, q1, q2, q3; + + int i = 0; + for (; i + 3 < kb; i += 4) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1q_f32(rhs_temp + stride); + q2 = vld1q_f32(rhs_temp + (stride << 1)); + q3 = vld1q_f32(rhs_temp + (stride * 3)); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + vst1q_f32(prhs_ptr + 12, q3); + + rhs_temp += stride << 2; + prhs_ptr += nr << 2; + } + for (; i + 1 < kb; i += 2) + { + q0 = vld1q_f32(rhs_temp); + q1 = vld1q_f32(rhs_temp + stride); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + + rhs_temp += stride << 1; + prhs_ptr += nr << 1; + } + for (; i < kb; i++) + { + q0 = vld1q_f32(rhs_temp); + vst1q_f32(prhs_ptr, q0); + + rhs_temp += stride; + prhs_ptr += nr; + } + + rhs_ptr += nr; + } + break; + default: + break; + } + + if (rn > 0) + { + for (int i = 0; i < kb; i++) + { + for (int j = 0; j < rn; j++) + { + prhs_ptr[j] = rhs_ptr[j]; + } + for (int j = rn; j < nr; j++) + { + prhs_ptr[j] = 0.f; + } + prhs_ptr += nr; + rhs_ptr += stride; + } + } +} + +void _pack_rowmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr) +{ + _pack_rowmajor_notrans_rhs(mr, mb, kb, stride, lhs_ptr, plhs_ptr); +} + +void _pack_rowmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr) +{ + _pack_rowmajor_notrans_lhs(nr, nb, kb, stride, rhs_ptr, prhs_ptr); +} + +static inline void _pack_rowmajor_image_subn(const int nr, const int nb, const int stride, + const float *buffer, float *prhs_ptr) +{ + const int nn = nb / nr; + const int rn = nb % nr; + + switch (nr) + { + case 24: + for (int j = 0; j < nn; j++) + { + float32x4_t q0, q1, q2, q3, q4, q5; + q0 = vld1q_f32(buffer); + q1 = vld1q_f32(buffer + 4); + q2 = vld1q_f32(buffer + 8); + q3 = vld1q_f32(buffer + 12); + q4 = vld1q_f32(buffer + 16); + q5 = vld1q_f32(buffer + 20); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + vst1q_f32(prhs_ptr + 12, q3); + vst1q_f32(prhs_ptr + 16, q4); + vst1q_f32(prhs_ptr + 20, q5); + prhs_ptr += stride; + buffer += 
nr; + } + break; + case 16: + for (int j = 0; j < nn; j++) + { + float32x4_t q0, q1, q2, q3; + q0 = vld1q_f32(buffer); + q1 = vld1q_f32(buffer + 4); + q2 = vld1q_f32(buffer + 8); + q3 = vld1q_f32(buffer + 12); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + vst1q_f32(prhs_ptr + 12, q3); + prhs_ptr += stride; + buffer += nr; + } + break; + case 12: + for (int j = 0; j < nn; j++) + { + float32x4_t q0, q1, q2; + q0 = vld1q_f32(buffer); + q1 = vld1q_f32(buffer + 4); + q2 = vld1q_f32(buffer + 8); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + vst1q_f32(prhs_ptr + 8, q2); + prhs_ptr += stride; + buffer += nr; + } + break; + case 8: + for (int j = 0; j < nn; j++) + { + float32x4_t q0, q1; + q0 = vld1q_f32(buffer); + q1 = vld1q_f32(buffer + 4); + vst1q_f32(prhs_ptr, q0); + vst1q_f32(prhs_ptr + 4, q1); + prhs_ptr += stride; + buffer += nr; + } + break; + case 6: + for (int j = 0; j < nn; j++) + { + float32x4_t q0; + float32x2_t q1; + q0 = vld1q_f32(buffer); + q1 = vld1_f32(buffer + 4); + vst1q_f32(prhs_ptr, q0); + vst1_f32(prhs_ptr + 4, q1); + prhs_ptr += stride; + buffer += nr; + } + break; + case 4: + for (int j = 0; j < nn; j++) + { + float32x4_t q0; + q0 = vld1q_f32(buffer); + vst1q_f32(prhs_ptr, q0); + prhs_ptr += stride; + buffer += nr; + } + break; + default: + break; + } + + if (rn > 0) + { + for (int j = 0; j < rn; j++) + { + prhs_ptr[j] = buffer[j]; + } + for (int j = rn; j < nr; j++) + { + prhs_ptr[j] = 0.f; + } + } +} + +void _pack_rowmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *prhs_ptr) +{ + const int w = input->w; + const int h = input->h; + const int outw = output->w; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + + const int in_row0 = n0 / outw * stride_h; + const int in_col0 = n0 % outw * stride_w; + int seg0 = outw - n0 % outw; + if (seg0 > nb) + seg0 = nb; + int rows = (nb - seg0 + outw - 1) / outw; + if (seg0) + rows++; + const int segn = (nb - seg0) % outw; + + float row_data[nb]; + + for (int i = k0; i < kb + k0; i++) + { + const int ic = i / (kernel_w * kernel_h); + const int in_row1 = ((i / kernel_w) % kernel_h) * params->dilation_h + in_row0; + const int in_col1 = i % kernel_w * params->dilation_w; + +#ifdef NCNN + const float *input_data = input->data + ic * alignSize(w * h, 16 / sizeof(float)); +#else // NCNN + const float *input_data = input->data + ic * w * h; +#endif // NCNN + float *buffer = row_data; + int in_row = in_row1 - pad_h; + + for (int out_rows = rows; out_rows; out_rows--) + { + int cols = (out_rows != 1 || segn == 0) ? 
outw : segn; + int in_col = in_col1 - pad_w; + if (out_rows == rows) + { + cols = seg0; + in_col += in_col0; + } + if ((unsigned int)in_row < (unsigned int)h) + { + for (int out_col = cols; out_col; out_col--) + { + if ((unsigned int)in_col < (unsigned int)w) + *(buffer++) = input_data[in_row * w + in_col]; + else + *(buffer++) = 0; + in_col += stride_w; + } + } + else + { + for (int out_col = cols; out_col; out_col--) + { + *(buffer++) = 0; + in_col += stride_w; + } + } + + in_row += stride_h; + } + + _pack_rowmajor_image_subn(nr, nb, nr * kb, row_data, prhs_ptr); + prhs_ptr += nr; + } +} + +void _pack_rowmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0, + const int n0, convMat_t *input, convMat_t *output, + convParams_t *params, float *prhs_ptr) +{ + const int w = input->w; + const int h = input->h; + const int c = input->c; + +#ifdef NCNN + const int seg_size = alignSize(output->w * output->h, 16 / sizeof(float)); +#else // NCNN + const int seg_size = output->w * output->h; +#endif // NCNN + +#ifdef NCNN + float *data = input->data + (alignSize(w * h, 16 / sizeof(float)) * c) * (n0 / seg_size); +#else // NCNN + float *data = input->data + (w * h * c) * (n0 / seg_size); +#endif // NCNN + + int seg0 = seg_size - n0 % seg_size; + if (seg0 > nb) + seg0 = nb; + int nseg = (nb - seg0 + seg_size - 1) / seg_size; + if (seg0) + nseg++; + const int segn = (nb - seg0) % seg_size; + convMat_t _input = {w, h, c, 1, data}; + + for (int i = 0; i < nseg; i++) + { + const int _nb = (i == 0 ? seg0 : (i == nseg - 1 ? segn : seg_size)); + const int _n0 = (i == 0 ? seg_size - seg0 : 0); + + _pack_rowmajor_image_rhs(nr, _nb, kb, k0, _n0, &_input, output, params, prhs_ptr); + +#ifdef NCNN + _input.data += alignSize(w * h, 16 / sizeof(float)) * c; +#else // NCNN + _input.data += w * h * c; +#endif // NCNN + } +} + +void _unpack_rowmajor_image_res(const int mb, const int nb, const int m0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *pres_ptr) +{ + const int outw = output->w; + const int outh = output->h; + const int w = input->w; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + + const int out_row0 = n0 / w * stride_h; + const int out_col0 = n0 % w * stride_w; + int seg0 = w - n0 % w; + if (seg0 > nb) + seg0 = nb; + int rows = (nb - seg0 + w - 1) / w; + if (seg0) + rows++; + const int segn = (nb - seg0) % w; + + for (int i = m0; i < mb + m0; i++) + { + const int oc = i / (kernel_w * kernel_h); + const int out_row1 = ((i / kernel_w) % kernel_h) * params->dilation_h + out_row0; + const int out_col1 = i % kernel_w * params->dilation_w; + +#ifdef NCNN + float *output_data = output->data + oc * alignSize(outw * outh, 16 / sizeof(float)); +#else // NCNN + float *output_data = output->data + oc * outw * outh; +#endif // NCNN + int out_row = out_row1 - pad_h; + + for (int in_rows = rows; in_rows; in_rows--) + { + int cols = (in_rows != 1 || segn == 0) ? 
w : segn; + int out_col = out_col1 - pad_w; + if (in_rows == rows) + { + cols = seg0; + out_col += out_col0; + } + if ((unsigned int)out_row < (unsigned int)outh) + { + for (int in_col = cols; in_col; in_col--) + { + if ((unsigned int)out_col < (unsigned int)outw) + output_data[out_row * outw + out_col] += *pres_ptr++; + else + pres_ptr++; + out_col += stride_w; + } + } + else + { + pres_ptr += cols; + } + out_row += stride_h; + } + } +} + +// TODO:v8 & other case. +static inline void _pack_colmajor_image_rhs_sub(const int nr, const int k, const float *buffer, + float *prhs_ptr) +{ + int nk = k >> 2; + int rk = k & 0x03; + + const int _stride = k << 2; + + switch (nr) + { + case 12: + if (nk > 0) + { +#if __aarch64__ + asm volatile("0:\n" + "mov x0, %[buffer]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v9.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v11.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v8.4s, v10.4s\n" + "zip2 v30.4s, v8.4s, v10.4s\n" + "zip1 v29.4s, v9.4s, v11.4s\n" + "zip2 v31.4s, v9.4s, v11.4s\n" + "zip1 v8.4s, v28.4s, v29.4s\n" + "zip2 v9.4s, v28.4s, v29.4s\n" + "zip1 v10.4s, v30.4s, v31.4s\n" + "zip2 v11.4s, v30.4s, v31.4s\n" + + "ld1 {v12.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v13.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v14.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v15.4s}, [x0]\n" + + "zip1 v28.4s, v12.4s, v14.4s\n" + "zip2 v30.4s, v12.4s, v14.4s\n" + "zip1 v29.4s, v13.4s, v15.4s\n" + "zip2 v31.4s, v13.4s, v15.4s\n" + "zip1 v12.4s, v28.4s, v29.4s\n" + "zip2 v13.4s, v28.4s, v29.4s\n" + "zip1 v14.4s, v30.4s, v31.4s\n" + "zip2 v15.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[prhs_ptr]], #16\n" + "st1 {v8.4s}, [%[prhs_ptr]], #16\n" + "st1 {v12.4s}, [%[prhs_ptr]], #16\n" + "st1 {v5.4s}, [%[prhs_ptr]], #16\n" + "st1 {v9.4s}, [%[prhs_ptr]], #16\n" + "st1 {v13.4s}, [%[prhs_ptr]], #16\n" + "st1 {v6.4s}, [%[prhs_ptr]], #16\n" + "st1 {v10.4s}, [%[prhs_ptr]], #16\n" + "st1 {v14.4s}, [%[prhs_ptr]], #16\n" + "st1 {v7.4s}, [%[prhs_ptr]], #16\n" + "st1 {v11.4s}, [%[prhs_ptr]], #16\n" + "st1 {v15.4s}, [%[prhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[buffer], %[buffer], #16\n" + "bne 0b\n" + : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v28", "v29", "v30", "v31"); +#else // __aarch64__ + asm volatile("0:\n" + "mov r0, %[buffer]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[_stride]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[_stride]\n" + 
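/* q8..q11 pick up rows 4..7 of the 12-row block; they get the same
   vzip-based 4x4 transpose treatment as q4..q7 above. */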
"vld1.f32 {d22-d23}, [r0]\n" + "add r0, r0, %[_stride]\n" + + "vzip.32 q8, q10\n" + "vzip.32 q9, q11\n" + "vzip.32 q8, q9\n" + "vzip.32 q10, q11\n" + + "vld1.f32 {d24-d25}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d26-d27}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d28-d29}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d30-d31}, [r0]\n" + + "vzip.32 q12, q14\n" + "vzip.32 q13, q15\n" + "vzip.32 q12, q13\n" + "vzip.32 q14, q15\n" + + "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n" + "vst1.f32 {d16-d17}, [%[prhs_ptr]]!\n" + "vst1.f32 {d24-d25}, [%[prhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n" + "vst1.f32 {d18-d19}, [%[prhs_ptr]]!\n" + "vst1.f32 {d26-d27}, [%[prhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n" + "vst1.f32 {d20-d21}, [%[prhs_ptr]]!\n" + "vst1.f32 {d28-d29}, [%[prhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n" + "vst1.f32 {d22-d23}, [%[prhs_ptr]]!\n" + "vst1.f32 {d30-d31}, [%[prhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[buffer], %[buffer], #16\n" + "bne 0b\n" + : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +#endif // __aarch64__ + } + + for (int j = 0; j < rk; j++) + { + prhs_ptr[0] = buffer[0]; + prhs_ptr[1] = buffer[k]; + prhs_ptr[2] = buffer[k << 1]; + prhs_ptr[3] = buffer[3 * k]; + prhs_ptr[4] = buffer[k << 2]; + prhs_ptr[5] = buffer[5 * k]; + prhs_ptr[6] = buffer[6 * k]; + prhs_ptr[7] = buffer[7 * k]; + prhs_ptr[8] = buffer[k << 3]; + prhs_ptr[9] = buffer[9 * k]; + prhs_ptr[10] = buffer[10 * k]; + prhs_ptr[11] = buffer[11 * k]; + prhs_ptr += nr; + buffer++; + } + break; + + case 8: + if (nk > 0) + { +#if __aarch64__ + asm volatile("0:\n" + "mov x0, %[buffer]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "ld1 {v8.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v9.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v10.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v11.4s}, [x0]\n" + + "zip1 v28.4s, v8.4s, v10.4s\n" + "zip2 v30.4s, v8.4s, v10.4s\n" + "zip1 v29.4s, v9.4s, v11.4s\n" + "zip2 v31.4s, v9.4s, v11.4s\n" + "zip1 v8.4s, v28.4s, v29.4s\n" + "zip2 v9.4s, v28.4s, v29.4s\n" + "zip1 v10.4s, v30.4s, v31.4s\n" + "zip2 v11.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[prhs_ptr]], #16\n" + "st1 {v8.4s}, [%[prhs_ptr]], #16\n" + "st1 {v5.4s}, [%[prhs_ptr]], #16\n" + "st1 {v9.4s}, [%[prhs_ptr]], #16\n" + "st1 {v6.4s}, [%[prhs_ptr]], #16\n" + "st1 {v10.4s}, [%[prhs_ptr]], #16\n" + "st1 {v7.4s}, [%[prhs_ptr]], #16\n" + "st1 {v11.4s}, [%[prhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[buffer], %[buffer], #16\n" + "bne 0b\n" + : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v28", "v29", "v30", "v31"); +#else // __aarch64__ + asm volatile("0:\n" + "mov r0, %[buffer]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, 
%[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[_stride]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d20-d21}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d22-d23}, [r0]\n" + + "vzip.32 q8, q10\n" + "vzip.32 q9, q11\n" + "vzip.32 q8, q9\n" + "vzip.32 q10, q11\n" + + "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n" + "vst1.f32 {d16-d17}, [%[prhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n" + "vst1.f32 {d18-d19}, [%[prhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n" + "vst1.f32 {d20-d21}, [%[prhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n" + "vst1.f32 {d22-d23}, [%[prhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[buffer], %[buffer], #16\n" + "bne 0b\n" + : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"); +#endif // __aarch64__ + } + + for (int j = 0; j < rk; j++) + { + prhs_ptr[0] = buffer[0]; + prhs_ptr[1] = buffer[k]; + prhs_ptr[2] = buffer[k << 1]; + prhs_ptr[3] = buffer[3 * k]; + prhs_ptr[4] = buffer[k << 2]; + prhs_ptr[5] = buffer[5 * k]; + prhs_ptr[6] = buffer[6 * k]; + prhs_ptr[7] = buffer[7 * k]; + prhs_ptr += nr; + buffer++; + } + break; +#if !__aarch64__ + case 6: + if (nk > 0) + { + asm volatile("0:\n" + "mov r0, %[buffer]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d16-d17}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d18-d19}, [r0]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + "vzip.32 q8, q9\n" + + "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n" + "vst1.f32 {d16}, [%[prhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n" + "vst1.f32 {d17}, [%[prhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n" + "vst1.f32 {d18}, [%[prhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n" + "vst1.f32 {d19}, [%[prhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[buffer], %[buffer], #16\n" + "bne 0b\n" + : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7", "q8", "q9"); + } + + for (int j = 0; j < rk; j++) + { + prhs_ptr[0] = buffer[0]; + prhs_ptr[1] = buffer[k]; + prhs_ptr[2] = buffer[k << 1]; + prhs_ptr[3] = buffer[3 * k]; + prhs_ptr[4] = buffer[k << 2]; + prhs_ptr[5] = buffer[5 * k]; + prhs_ptr += nr; + buffer++; + } + break; +#endif // !__aarch64__ + case 4: + if (nk > 0) + { +#if __aarch64__ + asm volatile("0:\n" + "mov x0, %[buffer]\n" + + "ld1 {v4.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v5.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v6.4s}, [x0]\n" + "add x0, x0, %[_stride]\n" + "ld1 {v7.4s}, [x0]\n" + + "zip1 v28.4s, v4.4s, v6.4s\n" + "zip2 v30.4s, v4.4s, v6.4s\n" + "zip1 v29.4s, v5.4s, v7.4s\n" + "zip2 v31.4s, v5.4s, v7.4s\n" + "zip1 v4.4s, v28.4s, v29.4s\n" + "zip2 v5.4s, v28.4s, v29.4s\n" + "zip1 v6.4s, v30.4s, v31.4s\n" + "zip2 v7.4s, v30.4s, v31.4s\n" + + "st1 {v4.4s}, [%[prhs_ptr]], #16\n" + "st1 {v5.4s}, [%[prhs_ptr]], #16\n" + "st1 {v6.4s}, [%[prhs_ptr]], #16\n" + "st1 {v7.4s}, [%[prhs_ptr]], #16\n" + + "subs %[nk], %[nk], #1\n" + "add %[buffer], %[buffer], #16\n" + "bne 0b\n" + : [buffer] 
"+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "x0", "v4", "v5", "v6", "v7", "v28", "v29", "v30", "v31"); +#else // __aarch64__ + asm volatile("0:\n" + "mov r0, %[buffer]\n" + + "vld1.f32 {d8-d9}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d10-d11}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d12-d13}, [r0]\n" + "add r0, r0, %[_stride]\n" + "vld1.f32 {d14-d15}, [r0]\n" + + "vzip.32 q4, q6\n" + "vzip.32 q5, q7\n" + "vzip.32 q4, q5\n" + "vzip.32 q6, q7\n" + + "vst1.f32 {d8-d9}, [%[prhs_ptr]]!\n" + "vst1.f32 {d10-d11}, [%[prhs_ptr]]!\n" + "vst1.f32 {d12-d13}, [%[prhs_ptr]]!\n" + "vst1.f32 {d14-d15}, [%[prhs_ptr]]!\n" + + "subs %[nk], %[nk], #1\n" + "add %[buffer], %[buffer], #16\n" + "bne 0b\n" + : [buffer] "+r"(buffer), [prhs_ptr] "+r"(prhs_ptr), [nk] "+r"(nk) + : [_stride] "r"(_stride) + : "cc", "memory", "r0", "q4", "q5", "q6", "q7"); +#endif // __aarch64__ + } + + for (int j = 0; j < rk; j++) + { + prhs_ptr[0] = buffer[0]; + prhs_ptr[1] = buffer[k]; + prhs_ptr[2] = buffer[k << 1]; + prhs_ptr[3] = buffer[3 * k]; + prhs_ptr += nr; + buffer++; + } + break; + default: + break; + } +} + +void _pack_colmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr) +{ + _pack_rowmajor_notrans_rhs(mr, mb, kb, stride, lhs_ptr, plhs_ptr); +} + +void _pack_colmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr) +{ + _pack_rowmajor_notrans_lhs(nr, nb, kb, stride, rhs_ptr, prhs_ptr); +} + +void _pack_colmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr) +{ + _pack_rowmajor_notrans_lhs(mr, mb, kb, stride, lhs_ptr, plhs_ptr); +} + +void _pack_colmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr) +{ + _pack_rowmajor_notrans_rhs(nr, nb, kb, stride, rhs_ptr, prhs_ptr); +} + +void _pack_colmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *prhs_ptr) +{ + const int w = input->w; + const int h = input->h; + const int c = input->c; + const int outw = output->w; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + const float *input_data = input->data; + + int c0 = c - k0 % c; + if (c0 > kb) + c0 = kb; + int nc = (kb - c0 + c - 1) / c; + if (c0) + nc++; + const int cn = (kb - c0) % c; + + int seg0 = outw - n0 % outw; + if (seg0 > nb) + seg0 = nb; + int rows = (nb - seg0 + outw - 1) / outw; + if (seg0) + rows++; + const int segn = (nb - seg0) % outw; + + const int in_row0 = n0 / outw * stride_h; + const int in_col0 = n0 % outw * stride_w; + + for (int i = 0; i < nc; i++) + { + const int channels = (i == 0 && c0 != 0) ? c0 : ((i == nc - 1 && cn != 0) ? cn : c); + const int c1 = (i == 0) ? k0 % c : 0; + + float tmp_data[channels * nr]; + int nindex = 0; + float *buffer = tmp_data; + float *prhs_tmp = prhs_ptr; + + const int in_row1 = (k0 / c + i) / kernel_w % kernel_h * params->dilation_h + in_row0; + const int in_col1 = (k0 / c + i) % kernel_w * params->dilation_w; + + int in_row = in_row1 - pad_h; + + for (int out_rows = rows; out_rows; out_rows--) + { + int cols = (out_rows != 1 || segn == 0) ? 
outw : segn; + int in_col = in_col1 - pad_w; + if (out_rows == rows) + { + cols = seg0; + in_col += in_col0; + } + if ((unsigned int)in_row < (unsigned int)h) + { + for (int out_col = cols; out_col; out_col--) + { + if ((unsigned int)in_col < (unsigned int)w) + { + for (int j = c1; j < c1 + channels; j++) + { + *(buffer++) = input_data[(in_row * w + in_col) * c + j]; + } + } + else + { + for (int j = 0; j < channels; j++) + { + *(buffer++) = 0; + } + } + in_col += stride_w; + + nindex++; + if (nindex == nr) + { + nindex = 0; + buffer = tmp_data; + _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp); + prhs_tmp += kb * nr; + } + } + } + else + { + for (int out_col = cols; out_col; out_col--) + { + for (int j = 0; j < channels; j++) + { + *(buffer++) = 0; + } + in_col += stride_w; + + nindex++; + if (nindex == nr) + { + nindex = 0; + buffer = tmp_data; + _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp); + prhs_tmp += kb * nr; + } + } + } + + in_row += stride_h; + } + + if (nindex > 0) + { + float *data = tmp_data; + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < nindex; j++) + { + prhs_tmp[j] = data[j * channels]; + } + for (int j = nindex; j < nr; j++) + { + prhs_tmp[j] = 0.f; + } + prhs_tmp += nr; + data++; + } + } + + prhs_ptr += channels * nr; + } +} + +void _pack_colmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0, + const int n0, convMat_t *input, convMat_t *output, + convParams_t *params, float *prhs_ptr) +{ + const int w = input->w; + const int h = input->h; + const int c = input->c; + const int outw = output->w; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + + int c0 = c - k0 % c; + if (c0 > kb) + c0 = kb; + int nc = (kb - c0 + c - 1) / c; + if (c0) + nc++; + const int cn = (kb - c0) % c; + + const int seg_size = output->w * output->h; + + const float *indata = input->data + (w * h * c) * (n0 / seg_size); + + int bseg0 = seg_size - n0 % seg_size; + if (bseg0 > nb) + bseg0 = nb; + int bnseg = (nb - bseg0 + seg_size - 1) / seg_size; + if (bseg0) + bnseg++; + const int bsegn = (nb - bseg0) % seg_size; + + for (int ll = 0; ll < nc; ll++) + { + const float *input_data = indata; + + const int channels = (ll == 0 && c0 != 0) ? c0 : ((ll == nc - 1 && cn != 0) ? cn : c); + const int c1 = (ll == 0) ? k0 % c : 0; + + int nindex = 0; + float *prhs_tmp = prhs_ptr; + float tmp_data[channels * nr]; + float *buffer = tmp_data; + + for (int i = 0; i < bnseg; i++) + { + const int _nb = + ((i == 0 && bseg0 != 0) ? bseg0 : ((i == bnseg - 1 && bsegn != 0) ? bsegn : seg_size)); + const int _n0 = (i == 0 ? n0 % seg_size : 0); + + int seg0 = outw - _n0 % outw; + if (seg0 > _nb) + seg0 = _nb; + int rows = (_nb - seg0 + outw - 1) / outw; + if (seg0) + rows++; + const int segn = (_nb - seg0) % outw; + + const int in_row0 = _n0 / outw * stride_h; + const int in_col0 = _n0 % outw * stride_w; + + const int in_row1 = (k0 / c + ll) / kernel_w % kernel_h + in_row0; + const int in_col1 = (k0 / c + ll) % kernel_w; + + int in_row = in_row1; + + for (int out_rows = rows; out_rows; out_rows--) + { + int cols = (out_rows != 1 || segn == 0) ? 
outw : segn; + int in_col = in_col1; + if (out_rows == rows) + { + cols = seg0; + in_col += in_col0; + } + if ((unsigned int)in_row < (unsigned int)h) + { + for (int out_col = cols; out_col; out_col--) + { + if ((unsigned int)in_col < (unsigned int)w) + { + for (int j = c1; j < c1 + channels; j++) + { + *(buffer++) = input_data[(in_row * w + in_col) * c + j]; + } + } + else + { + for (int j = 0; j < channels; j++) + { + *(buffer++) = 0; + } + } + in_col += stride_w; + + nindex++; + if (nindex == nr) + { + nindex = 0; + buffer = tmp_data; + _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp); + prhs_tmp += kb * nr; + } + } + } + else + { + for (int out_col = cols; out_col; out_col--) + { + for (int j = 0; j < channels; j++) + { + *(buffer++) = 0; + } + in_col += stride_w; + + nindex++; + if (nindex == nr) + { + nindex = 0; + buffer = tmp_data; + _pack_colmajor_image_rhs_sub(nr, channels, tmp_data, prhs_tmp); + prhs_tmp += kb * nr; + } + } + } + + in_row += stride_h; + } + + input_data += w * h * c; + } + + if (nindex > 0) + { + float *data = tmp_data; + for (int ii = 0; ii < channels; ii++) + { + for (int jj = 0; jj < nindex; jj++) + { + prhs_tmp[jj] = data[jj * channels]; + } + for (int jj = nindex; jj < nr; jj++) + { + prhs_tmp[jj] = 0.f; + } + prhs_tmp += nr; + data++; + } + } + + prhs_ptr += channels * nr; + } +} + +void _unpack_colmajor_image_res(const int mb, const int nb, const int m0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *pres_ptr) +{ + const int w = input->w; + const int outw = output->w; + const int outh = output->h; + const int outc = output->c; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + float *output_data = output->data; + + int c0 = outc - m0 % outc; + if (c0 > mb) + c0 = mb; + int nc = (mb - c0 + outc - 1) / outc; + if (c0) + nc++; + const int cn = (mb - c0) % outc; + + int seg0 = w - n0 % w; + if (seg0 > nb) + seg0 = nb; + int rows = (nb - seg0 + w - 1) / w; + if (seg0) + rows++; + const int segn = (nb - seg0) % w; + + const int out_row0 = n0 / w * stride_h; + const int out_col0 = n0 % w * stride_w; + + for (int i = 0; i < nc; i++) + { + const int channels = (i == 0 && c0 != 0) ? c0 : ((i == nc - 1 && cn != 0) ? cn : outc); + const int c1 = (i == 0) ? m0 % outc : 0; + + float *buffer = pres_ptr; + + const int out_row1 = (m0 / outc + i) / kernel_w % kernel_h * params->dilation_h + out_row0; + const int out_col1 = (m0 / outc + i) % kernel_w * params->dilation_w; + + int out_row = out_row1 - pad_h; + + for (int in_rows = rows; in_rows; in_rows--) + { + int cols = (in_rows != 1 || segn == 0) ? 
w : segn; + int out_col = out_col1 - pad_w; + if (in_rows == rows) + { + cols = seg0; + out_col += out_col0; + } + if ((unsigned int)out_row < (unsigned int)outh) + { + for (int in_col = cols; in_col; in_col--) + { + if ((unsigned int)out_col < (unsigned int)outw) + { + for (int j = c1; j < c1 + channels; j++) + { + // Note: this accumulation can race when several threads unpack into the same output; + // a "#pragma omp atomic" guard here is correct but was measured to be too slow. + output_data[(out_row * outw + out_col) * outc + j] += *(buffer + j - c1); + } + } + buffer += mb; + out_col += stride_w; + } + } + else + { + buffer += cols * mb; + } + out_row += stride_h; + } + + pres_ptr += channels; + } +} + +void _sparse_pack_rowmajor_image(const int nb, const int k0, const int n0, convMat_t *input, + convMat_t *output, convParams_t *params, float *prhs_ptr) +{ + const int w = input->w; + const int h = input->h; + const int outw = output->w; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + + const int in_row0 = n0 / outw * stride_h; + const int in_col0 = n0 % outw * stride_w; + int seg0 = outw - n0 % outw; + if (seg0 > nb) + seg0 = nb; + int rows = (nb - seg0 + outw - 1) / outw; + if (seg0) + rows++; + const int segn = (nb - seg0) % outw; + + const int ic = k0 / (kernel_w * kernel_h); + const int in_row1 = ((k0 / kernel_w) % kernel_h) * params->dilation_h + in_row0; + const int in_col1 = k0 % kernel_w * params->dilation_w; + +#ifdef NCNN + const float *input_data = input->data + ic * alignSize(w * h, 16 / sizeof(float)); +#else // NCNN + const float *input_data = input->data + ic * w * h; +#endif // NCNN + + int in_row = in_row1 - pad_h; + + for (int out_rows = rows; out_rows; out_rows--) + { + int cols = (out_rows != 1 || segn == 0) ? outw : segn; + int in_col = in_col1 - pad_w; + if (out_rows == rows) + { + cols = seg0; + in_col += in_col0; + } + if ((unsigned int)in_row < (unsigned int)h) + { + for (int out_col = cols; out_col; out_col--) + { + if ((unsigned int)in_col < (unsigned int)w) + *(prhs_ptr++) = input_data[in_row * w + in_col]; + else + *(prhs_ptr++) = 0; + in_col += stride_w; + } + } + else + { + for (int out_col = cols; out_col; out_col--) + { + *(prhs_ptr++) = 0; + in_col += stride_w; + } + } + + in_row += stride_h; + } +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/sgemm_pack.h b/compute/ncnn/src/srcn/sgemm_pack.h new file mode 100644 index 000000000..d64843ebb --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_pack.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
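// The image-packing routines declared below are blocked im2col transforms.
// For one output position (out_y, out_x) and filter tap (ky, kx), the element
// gathered from channel ic follows this index math (a sketch of the
// addressing only; the zero-fill branches in sgemm_pack.cc handle the
// out-of-bounds cases):
//
//   in_y = out_y * stride_h + ky * dilation_h - pad_h;
//   in_x = out_x * stride_w + kx * dilation_w - pad_w;
//   value = (0 <= in_y && in_y < h && 0 <= in_x && in_x < w)
//             ? input_data[in_y * w + in_x] : 0.f;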
+ */ + +#ifndef __NNFW_SRCN_SGEMM_PACK_H__ +#define __NNFW_SRCN_SGEMM_PACK_H__ + +#include "ncnn/srcn/conv_type.h" + +namespace nnfw +{ +namespace srcn +{ + +void _pack_rowmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr); +void _pack_rowmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr); +void _pack_rowmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr); +void _pack_rowmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr); +void _pack_rowmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *prhs_ptr); +void _pack_rowmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0, + const int n0, convMat_t *input, convMat_t *output, + convParams_t *params, float *prhs_ptr); + +void _unpack_rowmajor_image_res(const int mb, const int nb, const int m0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *pres_ptr); + +void _pack_colmajor_notrans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr); +void _pack_colmajor_notrans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr); +void _pack_colmajor_trans_lhs(const int mr, const int mb, const int kb, const int stride, + const float *lhs_ptr, float *plhs_ptr); +void _pack_colmajor_trans_rhs(const int nr, const int nb, const int kb, const int stride, + const float *rhs_ptr, float *prhs_ptr); + +void _pack_colmajor_image_rhs(const int nr, const int nb, const int kb, const int k0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *prhs_ptr); + +void _pack_colmajor_image_rhs_batch(const int nr, const int nb, const int kb, const int k0, + const int n0, convMat_t *input, convMat_t *output, + convParams_t *params, float *prhs_ptr); + +void _unpack_colmajor_image_res(const int mb, const int nb, const int m0, const int n0, + convMat_t *input, convMat_t *output, convParams_t *params, + float *pres_ptr); + +void _sparse_pack_rowmajor_image(const int nb, const int k0, const int n0, convMat_t *input, + convMat_t *output, convParams_t *params, float *prhs_ptr); + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_SGEMM_PACK_H__ diff --git a/compute/ncnn/src/srcn/sgemm_singlethread.cc b/compute/ncnn/src/srcn/sgemm_singlethread.cc new file mode 100644 index 000000000..3de3e1214 --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_singlethread.cc @@ -0,0 +1,689 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <stdexcept>
+
+#include "common.h"
+#include "sgemm_kernel.h"
+#include "sgemm_pack.h"
+#include "sgemm_singlethread.h"
+
+namespace nnfw
+{
+namespace srcn
+{
+
+void sgemm_singlethread::param_init()
+{
+  if (n_ >= m_)
+  {
+    shard_type_ = shardByRow;
+  }
+  else
+  {
+    shard_type_ = shardByCol;
+  }
+
+#if __aarch64__
+  if (major_type_ == rowMajor)
+  {
+    if (shard_type_ == shardByRow)
+    {
+      mr_ = 8;
+      nr_ = 12;
+    }
+    else
+    {
+      mr_ = 12;
+      nr_ = 8;
+    }
+  }
+  else if (major_type_ == colMajor)
+  {
+    mr_ = 12;
+    nr_ = 8;
+  }
+#else // __aarch64__
+  if (major_type_ == rowMajor)
+  {
+    // FIXME: This (notrans, trans) special case looks like a bug, but the root
+    // cause is not yet understood; the smaller register blocking is kept as a
+    // workaround for now.
+    if (ltrans_ == notrans && rtrans_ == trans)
+    {
+      mr_ = 4;
+      nr_ = 12;
+    }
+    else
+    {
+      mr_ = 6;
+      nr_ = 8;
+    }
+  }
+  else if (major_type_ == colMajor)
+  {
+    mr_ = 8;
+    nr_ = 6;
+  }
+#endif // __aarch64__
+
+  int k_div = (nr_ * sizeof_RhsScalar);
+  int k_sub = (mr_ * nr_ * sizeof_ResScalar);
+
+  int gen_col = GEN_COL / cache_div_;
+  int min_k = MAX_K / cache_div_;
+
+  const int k_cache = MIN(divup((int)(L1_CACHE_SIZE - k_sub), (int)k_div), min_k);
+  bk_ = MIN(k_cache, k_);
+
+  if (shard_type_ == shardByCol)
+  {
+    int m_sub = (bk_ * nr_ * sizeof_RhsScalar);
+    int m_div = (sizeof_LhsScalar * bk_ * 2 * cache_div_);
+    if (L3_CACHE_SIZE)
+      m_div = (sizeof_LhsScalar * bk_ * 2);
+    int m_cache = divup((L2_CACHE_SIZE - m_sub), m_div);
+    bm_ = MIN(m_cache, m_);
+
+    bn_ = MIN(gen_col, n_);
+    if (L3_CACHE_SIZE)
+    {
+      int n_sub = (bk_ * bm_ * sizeof_RhsScalar);
+      int n_cache = divup((L3_CACHE_SIZE - n_sub), (sizeof_LhsScalar * bk_ * 2));
+      bn_ = MIN(n_cache, bn_);
+    }
+  }
+  else
+  {
+    int n_sub = (bk_ * mr_ * sizeof_RhsScalar);
+    int n_div = (sizeof_LhsScalar * bk_ * 2 * cache_div_);
+    if (L3_CACHE_SIZE)
+      n_div = (sizeof_LhsScalar * bk_ * 2);
+    int n_cache = divup((L2_CACHE_SIZE - n_sub), n_div);
+    bn_ = MIN(n_cache, n_);
+
+    bm_ = MIN(gen_col, m_);
+    if (L3_CACHE_SIZE)
+    {
+      int m_sub = (bk_ * bn_ * sizeof_RhsScalar);
+      int m_cache = divup((L3_CACHE_SIZE - m_sub), (sizeof_LhsScalar * bk_ * 2));
+      bm_ = MIN(m_cache, bm_);
+    }
+  }
+
+  nm_ = divup(m_, bm_);
+  nn_ = divup(n_, bn_);
+  nk_ = divup(k_, bk_);
+
+  rm_ = m_ % bm_;
+  rn_ = n_ % bn_;
+  rk_ = k_ % bk_;
+}
+
+sgemm_singlethread::sgemm_singlethread(sgemmType_t major_type, sgemmTrans_t ltrans,
+                                       sgemmTrans_t rtrans, const int m, const int n, const int k,
+                                       const float *lhs_data, const float *rhs_data,
+                                       float *res_data, int cache_div)
+    : lhs_data_(lhs_data), rhs_data_(rhs_data), res_data_(res_data), major_type_(major_type),
+      ltrans_(ltrans), rtrans_(rtrans), m_(m), n_(n), k_(k), cache_div_(cache_div)
+{
+  param_init();
+}
+
+sgemm_singlethread::~sgemm_singlethread() {}
+
+void sgemm_singlethread::run()
+{
+  if (major_type_ == rowMajor)
+  {
+    if (ltrans_ == notrans && rtrans_ == notrans)
+    {
+      compute_rowmajor_nn();
+    }
+    else if (ltrans_ == notrans && rtrans_ == trans)
+    {
+      compute_rowmajor_nt();
+    }
+    else if (ltrans_ == trans && rtrans_ == notrans)
+    {
+      compute_rowmajor_tn();
+    }
+    else if (ltrans_ == trans && rtrans_ == trans)
+    {
+      compute_rowmajor_tt();
+    }
+    else
+    {
+      throw std::runtime_error{"error trans type."};
+    }
+  }
+  else if (major_type_ == colMajor)
+  {
+    if (ltrans_ == notrans && rtrans_ == notrans)
+    {
+      compute_colmajor_nn();
+    }
+    else if (ltrans_ == notrans && rtrans_ == trans)
+    {
+      compute_colmajor_nt();
+    }
+    else if (ltrans_ == trans && rtrans_ == notrans)
+    {
+      compute_colmajor_tn();
+    }
+    else if (ltrans_ == trans && rtrans_ == trans)
+    {
+      compute_colmajor_tt();
+    }
+    else
+    {
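+      // NOTE: param_init() above derives the block sizes from the cache hierarchy:
+      // bk_ is sized so an mr_ x nr_ result tile plus a k-strip of packed panels
+      // fits in L1, then bm_/bn_ are derived from L2 (and L3 when present).
+      // Worked example with assumed constants (not necessarily the tuned values
+      // used here): with L1_CACHE_SIZE = 32768, mr_ = 6, nr_ = 8, 4-byte scalars:
+      //   k_sub   = 6 * 8 * 4 = 192,  k_div = 8 * 4 = 32,
+      //   k_cache = divup(32768 - 192, 32) = 1018  (before the MAX_K / cache_div_ cap),
+      //   bk_     = MIN(k_cache, k_),   where divup(a, b) == ceil(a / b).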
throw std::runtime_error{"error trans type."}; + } + } + else + { + throw std::runtime_error{"error major type."}; + } +} + +void sgemm_singlethread::compute_rowmajor_nn() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +void sgemm_singlethread::compute_rowmajor_nt() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_notrans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
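+          // NOTE: Every compute_* variant follows the same GotoBLAS-style blocked
+          // structure: the operand packed in the middle loop (RHS when sharding by
+          // column, LHS when sharding by row) is reused across all iterations of
+          // the innermost loop, and the k-block index `l` passed to the macro
+          // kernel tells it whether this is the first k-pass over the result tile.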
bn_ : rn_; + + _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +void sgemm_singlethread::compute_rowmajor_tn() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + _pack_rowmajor_notrans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +void sgemm_singlethread::compute_rowmajor_tt() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + _sgemm_rowmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_rowmajor_trans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
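+          // NOTE: The *_trans_* pack helpers differ from the notrans ones only in
+          // which leading dimension they walk (k_ instead of n_ for the RHS, m_
+          // instead of k_ for the LHS), so the packed panels always come out in the
+          // same kernel-ready layout regardless of the input transpose flags.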
bn_ : rn_; + + _pack_rowmajor_trans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + _sgemm_rowmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[i * bm_ * n_ + j * bn_], l, n_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +void sgemm_singlethread::compute_colmajor_nn() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +void sgemm_singlethread::compute_colmajor_nt() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_notrans_lhs(mr_, bm, bk, m_, &lhs_data_[l * bk_ * m_ + i * bm_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
bn_ : rn_; + + _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +void sgemm_singlethread::compute_colmajor_tn() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + + _pack_colmajor_notrans_rhs(nr_, bn, bk, k_, &rhs_data_[j * bn_ * k_ + l * bk_], prhs_ptr); + + _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +void sgemm_singlethread::compute_colmajor_tt() +{ + int mstride = (bm_ + mr_ - 1) / mr_ * mr_; + int nstride = (bn_ + nr_ - 1) / nr_ * nr_; + + float plhs_ptr[mstride * bk_]; + float prhs_ptr[nstride * bk_]; + + if (shard_type_ == shardByCol) + { + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? bn_ : rn_; + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + _sgemm_colmajor_macro_kernel_divnm(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else if (shard_type_ == shardByRow) + { + for (int i = 0; i < nm_; i++) + { + const int bm = (i != nm_ - 1 || rm_ == 0) ? bm_ : rm_; + + for (int l = 0; l < nk_; l++) + { + const int bk = (l != nk_ - 1 || rk_ == 0) ? bk_ : rk_; + + _pack_colmajor_trans_lhs(mr_, bm, bk, k_, &lhs_data_[i * bm_ * k_ + l * bk_], plhs_ptr); + + for (int j = 0; j < nn_; j++) + { + const int bn = (j != nn_ - 1 || rn_ == 0) ? 
bn_ : rn_; + + _pack_colmajor_trans_rhs(nr_, bn, bk, n_, &rhs_data_[l * bk_ * n_ + j * bn_], prhs_ptr); + + _sgemm_colmajor_macro_kernel_divmn(mr_, nr_, bm, bn, bk, plhs_ptr, prhs_ptr, + &res_data_[j * bn_ * m_ + i * bm_], l, m_, bk); + } + } + } + } + else + { + throw std::runtime_error{"error shard type."}; + } +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/sgemm_singlethread.h b/compute/ncnn/src/srcn/sgemm_singlethread.h new file mode 100644 index 000000000..47954e028 --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_singlethread.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_SRCN_SGEMM_SINGLETHREAD_H__ +#define __NNFW_SRCN_SGEMM_SINGLETHREAD_H__ + +#include "common.h" + +namespace nnfw +{ +namespace srcn +{ + +typedef enum { rowMajor = 0, colMajor } sgemmType_t; + +typedef enum { trans = 0, notrans } sgemmTrans_t; + +class sgemm_singlethread +{ +public: + sgemm_singlethread(sgemmType_t major_type, sgemmTrans_t ltrans, sgemmTrans_t rtrans, const int m, + const int n, const int k, const float *lhs_data, const float *rhs_data, + float *res_data, int cache_div); + ~sgemm_singlethread(); + + void run(); + +private: + void param_init(); + + void compute_rowmajor_nn(); + void compute_rowmajor_nt(); + void compute_rowmajor_tn(); + void compute_rowmajor_tt(); + + void compute_colmajor_nn(); + void compute_colmajor_nt(); + void compute_colmajor_tn(); + void compute_colmajor_tt(); + + const float *lhs_data_; + const float *rhs_data_; + float *res_data_; + + sgemmType_t major_type_; + sgemmTrans_t ltrans_; + sgemmTrans_t rtrans_; + + int m_; + int n_; + int k_; + + int bm_; + int bn_; + int bk_; + + int rm_; + int rn_; + int rk_; + + int nm_; + int nn_; + int nk_; + + int mr_; + int nr_; + + shardType_t shard_type_; + int cache_div_; +}; + +} // namespace srcn +} // namespace nnfw + +#endif // __NNFW_SRCN_SGEMM_SINGLETHREAD_H__ diff --git a/compute/ncnn/src/srcn/sgemm_test.cc b/compute/ncnn/src/srcn/sgemm_test.cc new file mode 100644 index 000000000..1b10970bb --- /dev/null +++ b/compute/ncnn/src/srcn/sgemm_test.cc @@ -0,0 +1,1883 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <unistd.h> + +#include "ncnn/srcn/conv_type.h" +#include "srcn/srcn_conv.h" +//#include "srcn_sgemm.h" +#include "conv_sgemm_singlethread.h" +#include "conv_sgemm_multithreads.h" +//#include "conv_sgemm_batch.h" +#include "sgemm_singlethread.h" +#include "conv_winograd.h" +#include "winograd.h" + +//#include "conv_gpu.h" +//#include "convolutiondepthwise_3x3.h" + +namespace nnfw +{ +namespace srcn +{ + +static void direct_conv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter, + convParams_t *params) +{ + const int w = input->w; + const int h = input->h; + const int inch = input->c; + const int outw = output->w; + const int outh = output->h; + const int outch = output->c; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + const int dilation_w = params->dilation_w; + const int dilation_h = params->dilation_h; + const float *input_data = input->data; + const float *filter_data = filter->data; + float *output_data = output->data; + + for (int out_c = 0; out_c < outch; out_c++) + { + for (int out_row = 0; out_row < outh; out_row++) + { + for (int out_col = 0; out_col < outw; out_col++) + { + const int in_col0 = (out_col * stride_w) - pad_w; + const int in_row0 = (out_row * stride_h) - pad_h; + float sum = 0.f; + for (int in_c = 0; in_c < inch; in_c++) + { + for (int filter_y = 0; filter_y < kernel_h; filter_y++) + { + for (int filter_x = 0; filter_x < kernel_w; filter_x++) + { + const int in_col = in_col0 + filter_x * dilation_w; + const int in_row = in_row0 + filter_y * dilation_h; + + if (((unsigned int)in_col < (unsigned int)w) && + ((unsigned int)in_row < (unsigned int)h)) + { + float input_value = input_data[(in_c * h + in_row) * w + in_col]; + float filter_value = + filter_data[((out_c * inch + in_c) * kernel_h + filter_y) * kernel_w + + filter_x]; + sum += (input_value * filter_value); + } + } + } + } + output_data[(out_c * outh + out_row) * outw + out_col] = sum; + } + } + } +} + +static void direct_deconv_rowmajor(convMat_t *input, convMat_t *output, convMat_t *filter, + convParams_t *params) +{ + const int w = input->w; + const int h = input->h; + const int inch = input->c; + const int outw = output->w; + const int outh = output->h; + const int outch = output->c; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + const int dilation_w = params->dilation_w; + const int dilation_h = params->dilation_h; + const float *input_data = input->data; + const float *filter_data = filter->data; + float *output_data = output->data; + + for (int i = 0; i < outw * outh * outch; i++) + { + output_data[i] = 0; + } + + for (int in_c = 0; in_c < inch; in_c++) + { + for (int in_row = 0; in_row < h; in_row++) + { + for (int in_col = 0; in_col < w; in_col++) + { + const int out_col0 = (in_col * stride_w) - pad_w; + const int out_row0 = (in_row * stride_h) - pad_h; + float in_value = input_data[(in_c * h + in_row) * w + in_col]; + for (int out_c = 0; out_c < outch; out_c++) + { + for (int filter_y = 0; filter_y < kernel_h; filter_y++) + { + for (int filter_x = 0; filter_x < kernel_w; filter_x++) + { + const int out_col = out_col0 + filter_x * 
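+              // NOTE: direct_deconv_rowmajor() scatters instead of gathering: each
+              // input element is multiplied by the whole kernel and accumulated
+              // into the output window it maps onto, which is why the output buffer
+              // is zeroed up front. It is the transposed counterpart of the gather
+              // loops in direct_conv_rowmajor() above.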
dilation_w; + const int out_row = out_row0 + filter_y * dilation_h; + + if (((unsigned int)out_col < (unsigned int)outw) && + ((unsigned int)out_row < (unsigned int)outh)) + { + float filter_value = + filter_data[((in_c * outch + out_c) * kernel_h + filter_y) * kernel_w + + filter_x]; + output_data[(out_c * outh + out_row) * outw + out_col] += filter_value * in_value; + } + } + } + } + } + } + } +} + +static void direct_sgemm_rowmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B, + float *C) +{ + float *aa, *bb; + + if (Atrans == trans) + { + aa = (float *)malloc(m * k * sizeof(float)); + if (!aa) + return; + + for (int i = 0; i < k; i++) + { + for (int j = 0; j < m; j++) + { + aa[j * k + i] = A[i * m + j]; + } + } + } + else + { + aa = A; + } + + if (Btrans == trans) + { + bb = (float *)malloc(n * k * sizeof(float)); + if (!bb) + return; + + for (int i = 0; i < n; i++) + { + for (int j = 0; j < k; j++) + { + bb[j * n + i] = B[i * k + j]; + } + } + } + else + { + bb = B; + } + + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + float res = 0.f; + for (int l = 0; l < k; l++) + { + res += aa[i * k + l] * bb[l * n + j]; + } + C[i * n + j] = res; + } + } +} + +/*static void direct_sgemm_kernel(const int k, const int lhs_stride, const int rhs_stride, const int +res_stride, + const float *lhs_ptr, const float *rhs_ptr, float *res_ptr) +{ + int lstride = lhs_stride << 2; + int rstride = rhs_stride << 2; + int estride = res_stride << 2; + int rstep = rstride << 2; + + int nk = (k >> 2) - 1; + + __asm __volatile ( + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + "mov x0, %[lhs_ptr]\n" + "add %[lhs_ptr], %[lhs_ptr], #16\n" + "ld1 {v0.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + "ld1 {v1.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + "ld1 {v2.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + "ld1 {v3.4s}, [x0]\n" + "add x0, x0, %[lstride]\n" + + "mov x1, %[rhs_ptr]\n" + "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n" + "ld1 {v8.4s, v9.4s}, [x1]\n" + "add x1, x1, %[rstride]\n" + "ld1 {v10.4s, v11.4s}, [x1]\n" + "add x1, x1, %[rstride]\n" + + "1:\n" + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v16.4s, v10.4s, v0.s[1]\n" + "fmla v17.4s, v11.4s, v0.s[1]\n" + "fmla v18.4s, v8.4s, v1.s[0]\n" + "fmla v19.4s, v9.4s, v1.s[0]\n" + "fmla v18.4s, v10.4s, v1.s[1]\n" + "fmla v19.4s, v11.4s, v1.s[1]\n" + "ld1 {v12.4s, v13.4s}, [x1]\n" + "fmla v20.4s, v8.4s, v2.s[0]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v9.4s, v2.s[0]\n" + "ld1 {v14.4s, v15.4s}, [x1]\n" + "fmla v20.4s, v10.4s, v2.s[1]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v11.4s, v2.s[1]\n" + "fmla v22.4s, v8.4s, v3.s[0]\n" + "fmla v23.4s, v9.4s, v3.s[0]\n" + "fmla v22.4s, v10.4s, v3.s[1]\n" + "fmla v23.4s, v11.4s, v3.s[1]\n" + + "ld1 {v4.4s}, [x0]\n" + "fmla v16.4s, v12.4s, v0.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v13.4s, v0.s[2]\n" + "ld1 {v5.4s}, [x0]\n" + "fmla v16.4s, v14.4s, v0.s[3]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v15.4s, v0.s[3]\n" + "ld1 {v6.4s}, [x0]\n" + "fmla v18.4s, v12.4s, v1.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v19.4s, v13.4s, v1.s[2]\n" + "ld1 {v7.4s}, [x0]\n" + "fmla v18.4s, v14.4s, v1.s[3]\n" + "add x0, x0, 
%[lstride]\n" + "fmla v19.4s, v15.4s, v1.s[3]\n" + "fmla v20.4s, v12.4s, v2.s[2]\n" + "fmla v21.4s, v13.4s, v2.s[2]\n" + "fmla v20.4s, v14.4s, v2.s[3]\n" + "fmla v21.4s, v15.4s, v2.s[3]\n" + "fmla v22.4s, v12.4s, v3.s[2]\n" + "fmla v23.4s, v13.4s, v3.s[2]\n" + "fmla v22.4s, v14.4s, v3.s[3]\n" + "fmla v23.4s, v15.4s, v3.s[3]\n" + + "mov x0, %[lhs_ptr]\n" + "add %[lhs_ptr], %[lhs_ptr], #16\n" + + "fmla v24.4s, v8.4s, v4.s[0]\n" + "fmla v25.4s, v9.4s, v4.s[0]\n" + "ld1 {v0.4s}, [x0]\n" + "fmla v24.4s, v10.4s, v4.s[1]\n" + "add x0, x0, %[lstride]\n" + "fmla v25.4s, v11.4s, v4.s[1]\n" + "ld1 {v1.4s}, [x0]\n" + "fmla v26.4s, v8.4s, v5.s[0]\n" + "add x0, x0, %[lstride]\n" + "fmla v27.4s, v9.4s, v5.s[0]\n" + "ld1 {v2.4s}, [x0]\n" + "fmla v26.4s, v10.4s, v5.s[1]\n" + "add x0, x0, %[lstride]\n" + "fmla v27.4s, v11.4s, v5.s[1]\n" + "ld1 {v3.4s}, [x0]\n" + "fmla v28.4s, v8.4s, v6.s[0]\n" + "add x0, x0, %[lstride]\n" + "fmla v29.4s, v9.4s, v6.s[0]\n" + "fmla v28.4s, v10.4s, v6.s[1]\n" + "fmla v29.4s, v11.4s, v6.s[1]\n" + "fmla v30.4s, v8.4s, v7.s[0]\n" + "fmla v31.4s, v9.4s, v7.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + + "mov x1, %[rhs_ptr]\n" + "add %[rhs_ptr], %[rhs_ptr], %[rstep]\n" + + "fmla v24.4s, v12.4s, v4.s[2]\n" + "fmla v25.4s, v13.4s, v4.s[2]\n" + "ld1 {v8.4s, v9.4s}, [x1]\n" + "fmla v24.4s, v14.4s, v4.s[3]\n" + "add x1, x1, %[rstride]\n" + "fmla v25.4s, v15.4s, v4.s[3]\n" + "ld1 {v10.4s, v11.4s}, [x1]\n" + "fmla v26.4s, v12.4s, v5.s[2]\n" + "add x1, x1, %[rstride]\n" + "fmla v27.4s, v13.4s, v5.s[2]\n" + "fmla v26.4s, v14.4s, v5.s[3]\n" + "fmla v27.4s, v15.4s, v5.s[3]\n" + "fmla v28.4s, v12.4s, v6.s[2]\n" + "fmla v29.4s, v13.4s, v6.s[2]\n" + "fmla v28.4s, v14.4s, v6.s[3]\n" + "fmla v29.4s, v15.4s, v6.s[3]\n" + "fmla v30.4s, v12.4s, v7.s[2]\n" + "fmla v31.4s, v13.4s, v7.s[2]\n" + "subs %w[nk], %w[nk], #1\n" + "fmla v30.4s, v14.4s, v7.s[3]\n" + "fmla v31.4s, v15.4s, v7.s[3]\n" + "bne 1b\n" + + "fmla v16.4s, v8.4s, v0.s[0]\n" + "fmla v17.4s, v9.4s, v0.s[0]\n" + "fmla v16.4s, v10.4s, v0.s[1]\n" + "fmla v17.4s, v11.4s, v0.s[1]\n" + "fmla v18.4s, v8.4s, v1.s[0]\n" + "fmla v19.4s, v9.4s, v1.s[0]\n" + "fmla v18.4s, v10.4s, v1.s[1]\n" + "fmla v19.4s, v11.4s, v1.s[1]\n" + "ld1 {v12.4s, v13.4s}, [x1]\n" + "fmla v20.4s, v8.4s, v2.s[0]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v9.4s, v2.s[0]\n" + "ld1 {v14.4s, v15.4s}, [x1]\n" + "fmla v20.4s, v10.4s, v2.s[1]\n" + "add x1, x1, %[rstride]\n" + "fmla v21.4s, v11.4s, v2.s[1]\n" + "fmla v22.4s, v8.4s, v3.s[0]\n" + "fmla v23.4s, v9.4s, v3.s[0]\n" + "fmla v22.4s, v10.4s, v3.s[1]\n" + "fmla v23.4s, v11.4s, v3.s[1]\n" + + "ld1 {v4.4s}, [x0]\n" + "fmla v16.4s, v12.4s, v0.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v13.4s, v0.s[2]\n" + "ld1 {v5.4s}, [x0]\n" + "fmla v16.4s, v14.4s, v0.s[3]\n" + "add x0, x0, %[lstride]\n" + "fmla v17.4s, v15.4s, v0.s[3]\n" + "ld1 {v6.4s}, [x0]\n" + "fmla v18.4s, v12.4s, v1.s[2]\n" + "add x0, x0, %[lstride]\n" + "fmla v19.4s, v13.4s, v1.s[2]\n" + "ld1 {v7.4s}, [x0]\n" + "fmla v18.4s, v14.4s, v1.s[3]\n" + "add x0, x0, %[lstride]\n" + "fmla v19.4s, v15.4s, v1.s[3]\n" + "fmla v20.4s, v12.4s, v2.s[2]\n" + "fmla v21.4s, v13.4s, v2.s[2]\n" + "fmla v20.4s, v14.4s, v2.s[3]\n" + "fmla v21.4s, v15.4s, v2.s[3]\n" + "fmla v22.4s, v12.4s, v3.s[2]\n" + "fmla v23.4s, v13.4s, v3.s[2]\n" + "fmla v22.4s, v14.4s, v3.s[3]\n" + "fmla v23.4s, v15.4s, v3.s[3]\n" + + "mov x0, %[res_ptr]\n" + "fmla v24.4s, v8.4s, v4.s[0]\n" + "fmla v25.4s, v9.4s, v4.s[0]\n" + "st1 {v16.4s, v17.4s}, [x0]\n" + "add x0, x0, 
%[estride]\n" + "fmla v24.4s, v10.4s, v4.s[1]\n" + "fmla v25.4s, v11.4s, v4.s[1]\n" + "st1 {v18.4s, v19.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v26.4s, v8.4s, v5.s[0]\n" + "fmla v27.4s, v9.4s, v5.s[0]\n" + "st1 {v20.4s, v21.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v26.4s, v10.4s, v5.s[1]\n" + "fmla v27.4s, v11.4s, v5.s[1]\n" + "st1 {v22.4s, v23.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v28.4s, v8.4s, v6.s[0]\n" + "fmla v29.4s, v9.4s, v6.s[0]\n" + "fmla v28.4s, v10.4s, v6.s[1]\n" + "fmla v29.4s, v11.4s, v6.s[1]\n" + "fmla v30.4s, v8.4s, v7.s[0]\n" + "fmla v31.4s, v9.4s, v7.s[0]\n" + "fmla v30.4s, v10.4s, v7.s[1]\n" + "fmla v31.4s, v11.4s, v7.s[1]\n" + + "fmla v24.4s, v12.4s, v4.s[2]\n" + "fmla v25.4s, v13.4s, v4.s[2]\n" + "fmla v24.4s, v14.4s, v4.s[3]\n" + "fmla v25.4s, v15.4s, v4.s[3]\n" + "fmla v26.4s, v12.4s, v5.s[2]\n" + "fmla v27.4s, v13.4s, v5.s[2]\n" + "st1 {v24.4s, v25.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v26.4s, v14.4s, v5.s[3]\n" + "fmla v27.4s, v15.4s, v5.s[3]\n" + "fmla v28.4s, v12.4s, v6.s[2]\n" + "fmla v29.4s, v13.4s, v6.s[2]\n" + "st1 {v26.4s, v27.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v28.4s, v14.4s, v6.s[3]\n" + "fmla v29.4s, v15.4s, v6.s[3]\n" + "fmla v30.4s, v12.4s, v7.s[2]\n" + "fmla v31.4s, v13.4s, v7.s[2]\n" + "st1 {v28.4s, v29.4s}, [x0]\n" + "add x0, x0, %[estride]\n" + "fmla v30.4s, v14.4s, v7.s[3]\n" + "fmla v31.4s, v15.4s, v7.s[3]\n" + "st1 {v30.4s, v31.4s}, [x0]\n" + :[lhs_ptr] "+r" (lhs_ptr), [rhs_ptr] "+r" (rhs_ptr), [res_ptr] "+r" (res_ptr), + [nk] "+r" (nk) + : [lstride] "r" (lstride), [rstride] "r" (rstride), [estride] "r" (estride), [rstep] "r" +(rstep) + : "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +}*/ + +static void direct_conv_colmajor(convMat_t *input, convMat_t *output, convMat_t *filter, + convParams_t *params) +{ + const int w = input->w; + const int h = input->h; + const int inch = input->c; + const int outw = output->w; + const int outh = output->h; + const int outch = output->c; + const int kernel_w = params->kernel_w; + const int kernel_h = params->kernel_h; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + const int pad_w = params->pad_w; + const int pad_h = params->pad_h; + const int dilation_w = params->dilation_w; + const int dilation_h = params->dilation_h; + const float *input_data = input->data; + const float *filter_data = filter->data; + float *output_data = output->data; + + for (int out_row = 0; out_row < outh; out_row++) + { + for (int out_col = 0; out_col < outw; out_col++) + { + const int in_col0 = (out_col * stride_w) - pad_w; + const int in_row0 = (out_row * stride_h) - pad_h; + + for (int out_c = 0; out_c < outch; out_c++) + { + float sum = 0.f; + for (int filter_y = 0; filter_y < kernel_h; filter_y++) + { + for (int filter_x = 0; filter_x < kernel_w; filter_x++) + { + const int in_col = in_col0 + filter_x * dilation_w; + const int in_row = in_row0 + filter_y * dilation_h; + + if (((unsigned int)in_col < (unsigned int)w) && + ((unsigned int)in_row < (unsigned int)h)) + { + for (int in_c = 0; in_c < inch; in_c++) + { + float input_value = input_data[(in_row * w + in_col) * inch + in_c]; + float filter_value = + filter_data[((filter_y * kernel_w + filter_x) * inch + in_c) * outch + out_c]; + sum += (input_value * filter_value); + } + } + } + } + 
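+          // NOTE: "colmajor" here means channel-innermost (HWC) storage: the input
+          // is indexed as (row * w + col) * inch + c and the filter as
+          // ((ky * kw + kx) * inch + c) * outch + oc, whereas direct_conv_rowmajor()
+          // uses planar CHW indexing.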
output_data[(out_row * outw + out_col) * outch + out_c] = sum; + } + } + } +} + +static void direct_sgemm_colmajor(int Atrans, int Btrans, int m, int n, int k, float *A, float *B, + float *C) +{ + float *aa, *bb; + + if (Atrans) + { + aa = (float *)malloc(m * k * sizeof(float)); + if (!aa) + return; + + for (int i = 0; i < k; i++) + { + for (int j = 0; j < m; j++) + { + aa[i * m + j] = A[j * k + i]; + } + } + } + else + { + aa = A; + } + + if (Btrans) + { + bb = (float *)malloc(n * k * sizeof(float)); + if (!bb) + return; + + for (int i = 0; i < n; i++) + { + for (int j = 0; j < k; j++) + { + bb[i * k + j] = B[j * n + i]; + } + } + } + else + { + bb = B; + } + + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + float res = 0.f; + for (int l = 0; l < k; l++) + { + res += bb[j * k + l] * aa[l * m + i]; + } + C[j * m + i] = res; + } + } +} + +#if 0 +static int test_sgemm(int m, int n, int k, int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + const int mb = 180; + const int nb = 1440; + const int kb = 512; + + const int mr = 4; + const int nr = 12; + +#if 0 + const int pm = (m + mr - 1) / mr * mr; + const int pn = (n + nr - 1) / nr * nr; + const int pk = k; +#else + const int pm = (mb + mr - 1) / mr * mr; + const int pn = (nb + nr - 1) / nr * nr; + const int pk = kb; +#endif + const int nm = (m + mb - 1) / mb; + const int nn = (n + nb - 1) / nb; + const int nk = (k + kb - 1) / kb; + + const int rm = m % mb; + const int rn = n % nb; + const int rk = k % kb; + + float *A = (float *)malloc(m * k * sizeof(float)); + if(!A) return 0; + + for(int i = 0 ; i < m * k; i++) + { + A[i] = 0.001 + i * 0.000001; + } + + float *B = (float *)malloc(k * n * sizeof(float)); + if(!B) return 0; + + for(int i = 0 ; i < n * k; i++) + { + B[i] = 0.001 - i * 0.000001; + } + + float *C = (float *)malloc(m * n * sizeof(float)); + if(!C) return 0; + +#if 0 + float *PA = (float *)malloc(pm * pk * sizeof(float)); + if(!PA) return 0; + + float *PB = (float *)malloc(pk * pn * sizeof(float)); + if(!PB) return 0; +#else + float PA[pm * pk]; + float PB[pk * pn]; +#endif + + for(int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + //pack_rowmajor_notrans_lhs(mr, m, k, k, A, PA); + //pack_rowmajor_notrans_rhs(nr, n, k, n, B, PB); +#if 1 + for (int j = 0; j < nn; j++) + { + const int _nb = (j != nn - 1 || rn == 0) ? nb : rn; + for (int l = 0; l < nk; l++) + { + const int _kb = (l != nk - 1 || rk == 0) ? kb : rk; + pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + j * nb], PB); + for(int i = 0; i < nm; i++) + { + const int _mb = (i != nm - 1 || rm == 0) ? mb : rm; + pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[i * mb * k + l * kb], PA); + sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, PA, PB, &C[i * mb * n + j * nb], l, n, _kb); + //sgemm_rowmajor_macro_kernel_divnm(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk); + } + } + } +#else + for (int j = 0; j < nm; j++) + { + const int _mb = (j != nm - 1 || rm == 0) ? mb : rm; + for (int l = 0; l < nk; l++) + { + const int _kb = (l != nk - 1 || rk == 0) ? kb : rk; + pack_rowmajor_notrans_lhs(mr, _mb, _kb, 1, k, &A[j * mb * k + l * kb], PA); + for(int i = 0; i < nn; i++) + { + const int _nb = (i != nn - 1 || rn == 0) ? 
nb : rn; + pack_rowmajor_notrans_rhs(nr, _nb, _kb, 1, n, &B[l * kb * n + i * nb], PB); + sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, PA, PB, &C[j * mb * n + i * nb], l, n, _kb); + //sgemm_rowmajor_macro_kernel_divmn(mr, nr, _mb, _nb, _kb, &PA[i * mb * k + l * kb], &PB[l * kb * pn + j * nb], &C[i * mb * n + j * nb], l, n, pk); + } + } + } +#endif + gettimeofday(&end, NULL); + total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec))/1000; + } + + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 64 : m * n; + + float *c_ptr = &C[0]; + for(int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if((i + 1) % div == 0) printf("\n"); + } + + printf("\n"); + + c_ptr = &C[m * n - num]; + for(int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if((i + 1) % div == 0) printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m *n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops , total_size, (double)total_size/(total_time / loops)/1000000); + + free(A); + free(B); + free(C); + + //free(PA); + //free(PB); + +} +#endif + +static int test_sgemm(int m, int n, int k, int type, int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + // printf("1.\n"); + + float *A = (float *)malloc(m * k * sizeof(float)); + if (!A) + return 0; + + for (int i = 0; i < m * k; i++) + { + A[i] = 0.001 + i * 0.001; // i * 0.000001; + } + + float *B = (float *)malloc(k * n * sizeof(float)); + if (!B) + return 0; + + for (int i = 0; i < n * k; i++) + { + B[i] = 0.001 - i * 0.001; // - i * 0.000001; + } + + float *C = (float *)malloc(m * n * sizeof(float)); + if (!C) + return 0; + + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + if (type == 0) + { + // direct_sgemm_rowmajor(notrans, notrans, m, n, k, A, B, C); + direct_sgemm_colmajor(notrans, notrans, m, n, k, A, B, C); + } + + else if (type == 1) + { + class sgemm_singlethread my_gemm(colMajor, notrans, notrans, m, n, k, A, B, C, 1); + my_gemm.run(); + } + + /*else if(type == 2) + { + for(int i = 0; i < m / 8; i++) + { + for(int j = 0; j < n / 8; j++) + { + direct_sgemm_kernel(k, k, n, n, A + i * 8 * k, B + j * 8, C + i * 8 * n + j * 8); + } + } + }*/ + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 
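+  // NOTE: total_size below counts 2 * m * n * k floating-point operations (one
+  // multiply and one add per inner-product term), so total_size divided by the
+  // average time in ms and then by 1e6 is a throughput in GFLOP/s, which is what
+  // the "GFLOP" field of the printf actually reports.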
64 : m * n; + + float *c_ptr = &C[0]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &C[m * n - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m * n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops, + total_size, (double)total_size / (total_time / loops) / 1000000); + + free(A); + free(B); + free(C); + + return 0; +} + +void weight_tensorflow2caffe(float *out, float *in, int H, int W, int C, int N) +{ // HWCN ---> NCHW + for (int h = 0; h < H; ++h) + { + for (int w = 0; w < W; ++w) + { + for (int c = 0; c < C; ++c) + { + for (int n = 0; n < N; ++n) + { + int index_in = h * W * C * N + w * C * N + c * N + n; + int index_out = n * C * H * W + c * H * W + h * W + w; + // printf("%3d <--- %3d\n", index_out, index_in); + out[index_out] = in[index_in]; + } + } + } + } +} + +void trans_weight2winograd(const convMat_t &_kernel, float **winograd_weight) +{ + const double *G; + const int kernel_size = _kernel.h; + const int channels = _kernel.c; + const int num_output = _kernel.n; + + int tile_h_in_, tile_w_in_; + int M, N; + + /*Step 1: transfer weight to winograd domain*/ + if (kernel_size == 3) + { + M = winograd_para_3x3s1::M; + N = winograd_para_3x3s1::N; + G = winograd_para_3x3s1::getG(); + } + else + { + M = winograd_para_5x5s1::M; + N = winograd_para_5x5s1::N; + G = winograd_para_5x5s1::getG(); + } + + tile_h_in_ = tile_w_in_ = M; + + float *winograd_g = new float[M * M * N * N]; + if (NULL == winograd_g) + return; + kronecker_product(winograd_g, G, G, M, N, M, N); + + *winograd_weight = new float[tile_h_in_ * tile_w_in_ * channels * num_output]; + + if (NULL == *winograd_weight) + return; + + float *weight_data_tran = new float[_kernel.h * _kernel.w * _kernel.c * _kernel.n]; + if (NULL == weight_data_tran) + return; + weight_tensorflow2caffe(weight_data_tran, _kernel.data, kernel_size, kernel_size, channels, + num_output); + + class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_, + channels * num_output, kernel_size * kernel_size, winograd_g, + weight_data_tran, *winograd_weight, 1); + + sgemm.run(); + + delete[] weight_data_tran; + + /*With winograd, original weight data is useless.*/ + delete[] winograd_g; +} + +static int test_conv(const int w, const int h, const int kernel_size, const int stride, + const int inch, const int outch, const int padding, const int conv_type, + const int thread_num, const int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + struct timeval start1, end1; + float total_time1 = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation + (w - 1) / stride * stride - w; + int pad_h = kernel_dilation + (h - 1) / stride * stride - h; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.n = 1; +#ifdef NCNN + input.data = + (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float)); +#else + input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float)); 
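+  // NOTE: In the NCNN branch above, each channel plane of w * h floats is padded
+  // to a 16-byte boundary. A sketch of the alignment helper this assumes
+  // (ncnn-style, illustrative only):
+  //   static inline int alignSize(int sz, int n) { return (sz + n - 1) & -n; }
+  // so alignSize(w * h, 16 / sizeof(float)) rounds the plane up to a multiple of
+  // four floats.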
+#endif + + if (!input.data) + return 0; + + output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + output.c = outch; + output.n = 1; +#ifdef NCNN + output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c * + sizeof(float)); +#else + output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float)); +#endif + + if (!output.data) + return 0; + + for (int i = 0; i < output.w * output.h * output.c; i++) + { + output.data[i] = 0; + } + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = inch; + filter.n = outch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + +#if 1 + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + filter.data[i] = 0.001 - i * 0.000001; + } +#else + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + if ((i + 1) % 15 == 0) + filter.data[i] = 0.001 - i * 0.000001; + else + filter.data[i] = 0; + } +#endif + params.kernel_w = kernel_size; + params.kernel_h = kernel_size; + params.stride_w = stride; + params.stride_h = stride; + params.padding = padding; + params.pad_w = pad_l; + params.pad_h = pad_t; + params.dilation_w = dilation; + params.dilation_h = dilation; + + const int m = output.c; + const int n = output.w * output.h; + const int k = params.kernel_h * params.kernel_w * input.c; + + // ocl_context_t context; + size_t local_min[2]; + /** + if(conv_type == 14 || conv_type == 15 || conv_type == 6) + { + if(init_gpu(&context) < 0) return -1; + //if(conv_type ==14 || conv_type == 5) sgemm_ocltune(&context, m, n, (k < 1024 ? 
k : + 1024), local_min); + //else if(conv_type == 6) + { + if(kernel_size == 3) directconv_3x3S1_tune(&context, &input, &filter, &output, + local_min); + else if(kernel_size == 1) directconv_1x1S1_tune(&context, &input, &filter, &output, + local_min); + } + //local_min[0] = 1; local_min[1] = 1; + } + **/ + if (conv_type == 0) + { + for (int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + direct_conv_rowmajor(&input, &output, &filter, ¶ms); + // direct_conv_colmajor(&input, &output, &filter, ¶ms); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 1) + { + for (int nloop = 0; nloop < loops; nloop++) + { + // printf("nloop = %d, thread_num = %d\n", nloop, thread_num); + // class srcn_sgemm my_gemm(input, filter, output, params, thread_num, col_major); + gettimeofday(&start, NULL); + + /*if(thread_num == 1) + { + class conv_sgemm_singlethread my_gemm(input, filter, output, params, col_major); + my_gemm.run(); + } + else + { + class conv_sgemm_multithreads my_gemm(input, filter, output, params, thread_num, + col_major); + my_gemm.run(); + }*/ + + srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major); + + // printf("sync\n"); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 2) + { + float *winograd_weight; + + // trans_weight2winograd(filter, &winograd_weight); + + winogradParams_t wparams = {params.kernel_w, + params.kernel_h, + params.stride_w, + params.stride_h, + params.dilation_w, + params.dilation_h, + 1, + w, + h, + input.c, + output.c, + thread_num, + col_major, + filter.data}; + winograd_weight = trans_weight2winograd(wparams); + + for (int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + // class conv_winograd my_sgemm(input, output, params, col_major, winograd_weight, thread_num, + // w * h, n); + // my_sgemm.run(); + + srcn_convolution2D(input, filter, output, params, winograd_weight, thread_num, row_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 3) + { + void *sparse_weight = trans_weight2sparse(filter); + + for (int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + srcn_sparse_convolution2D(input, output, params, sparse_weight, thread_num, row_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + + sparse_release(outch, sparse_weight); + } /** +else if(conv_type == 4) +{ +#if 0 + cl_int err; + convlib::load_opencl("./libmali.so"); + const int mpad = (m + 4 - 1) / 4 * 4; + const int npad = (n + 4 - 1) / 4 * 4; + cl_mem lhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE | +CL_MEM_ALLOC_HOST_PTR, mpad * k * sizeof(float), NULL, &err); + if(err != CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__); + return -1; + } + + cl_image_format rhs_format = {CL_RGBA, CL_FLOAT}; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D, + (size_t)npad / 4, + (size_t)k, + 0, 0, + 0, + 0, 0, 0, 0 + }; + cl_mem rhs_gpu = convlib::clCreateImage(context.context, CL_MEM_READ_ONLY | +CL_MEM_ALLOC_HOST_PTR, &rhs_format, &desc, NULL, &err); + if(err != CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, 
__LINE__); + return -1; + } + + cl_mem rhs_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE | +CL_MEM_ALLOC_HOST_PTR, npad * k * sizeof(float), NULL, &err); + if(err != CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__); + return -1;; + } + + cl_mem res_gpu = convlib::clCreateBuffer(context.context, CL_MEM_READ_WRITE | +CL_MEM_ALLOC_HOST_PTR, mpad * npad * sizeof(float), NULL, &err); + if(err != CL_SUCCESS) + { + printf("err = %d@%s:%d\n", err, __FUNCTION__, __LINE__); + return -1; + } +#endif + for(int nloop = 0; nloop < loops + 1; nloop++) + { + gettimeofday(&start, NULL); + + //cl_mem _res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, ¶ms, local_min, +lhs_gpu, rhs_gpu, res_gpu); + + //get_result_gpu(&context, output.data + gpu_data_off, _res_gpu, m, n); + srcn_convolution2D_gpu(input, filter, output, params, row_major); + + gettimeofday(&end, NULL); + + if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 ++ start.tv_usec))/1000; + } +} +else if(conv_type == 5) +{ + + for(int nloop = 0; nloop < loops + 1; nloop++) + { + gettimeofday(&start, NULL); + + //cl_mem res_gpu = conv2D_gpu_sgemm(&context, &input, &filter, &output, ¶ms, local_min); + + //clFlush(context.cmdQueue); + gettimeofday(&start1, NULL); + #if 1 + srcn_convolution2D(input, filter, output, params, NULL, thread_num, row_major + + #endif + //usleep(80 * 1000); + gettimeofday(&end1, NULL); + total_time1 += ((end1.tv_sec * 1000000 + end1.tv_usec) - (start1.tv_sec * 1000000 + +start1.tv_usec))/1000; + + //get_result_gpu(&context, output.data + gpu_data_off, res_gpu, m, n); + + srcn_convolution2D_dpu(input, filter, output, params, row_major); + + gettimeofday(&end, NULL); + if(nloop > 0) total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 ++ start.tv_usec))/1000; + } +} +else if(conv_type == 6) +{ + for(int nloop = 0; nloop < loops; nloop++) + { + gettimeofday(&start, NULL); + + if(kernel_size == 3 && stride == 1 && padding == 0) + { + conv2D_gpu_directconv_3x3S1(&context, &input, &filter, &output, ¶ms, local_min); + } + else if(kernel_size == 1 && stride == 1 && padding == 0) + { + conv2D_gpu_directconv_1x1S1(&context, &input, &filter, &output, ¶ms, local_min); + } + + gettimeofday(&end, NULL); + total_time += ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + +start.tv_usec))/1000; + } +}**/ + + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 
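+  // NOTE: conv_type selects the backend exercised above: 0 = naive direct
+  // convolution (reference), 1 = im2col + SGEMM via srcn_convolution2D,
+  // 2 = Winograd with weights pre-transformed once by trans_weight2winograd,
+  // 3 = sparse convolution on weights converted by trans_weight2sparse; the
+  // GPU/DPU paths (conv_type 4-6) are kept commented out.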
64 : m * n; + + if (conv_type < 4) + printf("[CPU RESULT]\n"); + else if (conv_type == 4) + printf("[GPU RESULT]\n"); + else if (conv_type == 5) + printf("[DPU RESULT]\n"); + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[m * n - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m * n * k * 2; + printf( + "AVER Time consuming: %.2fms, CPU Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", + total_time / loops, total_time1 / loops, total_size, + (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + + return 0; +} + +static int test_deconv(const int w, const int h, const int kernel_size, const int stride, + const int inch, const int outch, const int padding, const int conv_type, + const int thread_num, const int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation - 1; + int pad_h = kernel_dilation - 1; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float)); + if (!input.data) + return 0; + + // output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + // output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + output.w = stride * (w - 1) + kernel_dilation - (pad_l + pad_r); + output.h = stride * (h - 1) + kernel_dilation - (pad_t + pad_b); + output.c = outch; + output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float)); + if (!output.data) + return 0; + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = outch; + filter.n = inch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + filter.data[i] = 0.001 - i * 0.000001; + } + + params.kernel_w = kernel_size; + params.kernel_h = kernel_size; + params.stride_w = stride; + params.stride_h = stride; + params.padding = padding; + params.pad_w = pad_l; + params.pad_h = pad_t; + params.dilation_w = dilation; + params.dilation_h = dilation; + + const int m = params.kernel_h * params.kernel_w * output.c; + const int n = input.w * input.h; + const int k = input.c; + + if (conv_type == 0) + { + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + direct_deconv_rowmajor(&input, &output, &filter, ¶ms); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + else if (conv_type == 1) + { + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + for (int i = 0; i < output.w * output.h * output.c; i++) + { + output.data[i] = 0; + } + + srcn_deconvolution2D(input, filter, output, params, thread_num, 
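+      // NOTE: The output spatial size above inverts the forward convolution
+      // formula: out = stride * (in - 1) + kernel_dilation - (pad_l + pad_r).
+      // The output buffer is re-zeroed on every iteration because the
+      // deconvolution accumulates into it rather than overwriting it.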
row_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + + const int output_size = output.w * output.h * output.c; + + int div = output_size < 16 ? output_size : 16; + int num = output_size > 64 ? 64 : output_size; + + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[output_size - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m * n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops, + total_size, (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + + return 0; +} + +static int test_batch_conv(const int batch, const int w, const int h, const int kernel_size, + const int stride, const int inch, const int outch, const int padding, + const int conv_type, const int thread_num, const int loops) +{ + struct timeval start, end; + float total_time = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation + (w - 1) / stride * stride - w; + int pad_h = kernel_dilation + (h - 1) / stride * stride - h; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.n = batch; + input.data = (float *)malloc(input.n * input.w * input.h * input.c * sizeof(float)); + if (!input.data) + return 0; + + output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + output.c = outch; + output.n = batch; + output.data = (float *)malloc(output.n * output.w * output.h * output.c * sizeof(float)); + if (!output.data) + return 0; + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = inch; + filter.n = outch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c * input.n; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + + for (int i = 0; i < filter.w * filter.h * filter.c * filter.n; i++) + { + filter.data[i] = 0.001 - i * 0.000001; + } + + params.kernel_w = kernel_size; + params.kernel_h = kernel_size; + params.stride_w = stride; + params.stride_h = stride; + params.padding = padding; + params.pad_w = pad_l; + params.pad_h = pad_t; + params.dilation_w = dilation; + params.dilation_h = dilation; + + const int m = output.c; + const int n = output.w * output.h; + const int k = params.kernel_h * params.kernel_w * input.c; + + if (conv_type == 1) + { + for (int nloop = 0; nloop < loops; nloop++) + + { + // printf("nloop = %d, thread_num = %d\n", nloop, thread_num); + // class srcn_sgemm my_gemm(input, filter, output, params, thread_num, col_major); + + gettimeofday(&start, NULL); + + srcn_batch_convolution2D(input, filter, output, params, NULL, thread_num, col_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 
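+// test_batch_conv times the conv-as-GEMM problem per image: with m = output.c,
+// n = output.w * output.h and k = kernel_h * kernel_w * input.c, an im2col-based
+// convolution is one m x n x k matrix multiply, i.e. 2 * m * n * k FLOPs (one
+// multiply plus one add per term); the total_size printed below is batch times
+// this. Worked example (assumed sizes, not a real run): a 3x3 stride-1 layer with
+// inch = 32, outch = 64 and a 56x56 output gives m = 64, n = 3136, k = 288, so
+// 2 * 64 * 3136 * 288 = 115,605,504 FLOPs (~0.116 GFLOP) per image per iteration.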
1000; + } + } + else if (conv_type == 2) + { + float *winograd_weight; + + // trans_weight2winograd(filter, &winograd_weight); + + winogradParams_t wparams = {params.kernel_w, + params.kernel_h, + params.stride_w, + params.stride_h, + params.dilation_w, + params.dilation_h, + input.n, + w, + h, + input.c, + output.c, + thread_num, + col_major, + filter.data}; + winograd_weight = trans_weight2winograd(wparams); + + for (int nloop = 0; nloop < loops; nloop++) + + { + gettimeofday(&start, NULL); + + srcn_batch_convolution2D(input, filter, output, params, winograd_weight, thread_num, + col_major); + + gettimeofday(&end, NULL); + total_time += + ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000; + } + } + + int div = m * n < 16 ? m * n : 16; + int num = m * n > 64 ? 64 : m * n; + + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[m * n * batch - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)batch * m * n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops, + total_size, (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + + return 0; +} + +static int test_depthwise_conv(const int w, const int h, const int kernel_size, const int stride, + const int inch, const int outch, const int padding, + const int conv_type, const int thread_num, const int loops) +{ + if (outch != inch) + return -1; + struct timeval start, end; + float total_time = 0.f; + + const int dilation = 1; + + const int kernel_dilation = dilation * (kernel_size - 1) + 1; + + convMat_t input; + convMat_t output; + convMat_t filter; + convMat_t bias; + convParams_t params; + + int pad_l, pad_r, pad_t, pad_b; + if (padding) + { + int pad_w = kernel_dilation + (w - 1) / stride * stride - w; + int pad_h = kernel_dilation + (h - 1) / stride * stride - h; + pad_l = pad_w / 2; + pad_r = pad_w - pad_l; + pad_t = pad_h / 2; + pad_b = pad_h - pad_t; + } + else + { + pad_l = pad_r = pad_t = pad_b = 0; + } + + input.w = w; + input.h = h; + input.c = inch; + input.n = 1; +#ifdef NCNN + input.data = + (float *)malloc(alignSize(input.w * input.h, 16 / sizeof(float)) * input.c * sizeof(float)); +#else + input.data = (float *)malloc(input.w * input.h * input.c * sizeof(float)); +#endif + if (!input.data) + return 0; + + output.w = (w + pad_l + pad_r - kernel_dilation) / stride + 1; + output.h = (h + pad_t + pad_b - kernel_dilation) / stride + 1; + output.c = outch; + output.n = 1; + +#ifdef NCNN + output.data = (float *)malloc(alignSize(output.w * output.h, 16 / sizeof(float)) * output.c * + sizeof(float)); +#else + output.data = (float *)malloc(output.w * output.h * output.c * sizeof(float)); +#endif + const int gpu_data_off = output.w * output.h * output.c; + if (!output.data) + return 0; + + for (int i = 0; i < output.w * output.h * output.c; i++) + { + output.data[i] = 1.f; + } + + filter.w = kernel_size; + filter.h = kernel_size; + filter.c = 1; + filter.n = outch; + filter.data = (float *)malloc(filter.w * filter.h * filter.c * filter.n * sizeof(float)); + if (!filter.data) + return 0; + + for (int i = 0; i < input.w * input.h * input.c; i++) + { + input.data[i] = 0.001 + i * 0.000001; + } + + for (int i = 0; i < filter.w * filter.h * 
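+// conv_type == 2 in test_batch_conv above exercises the Winograd path: weights are
+// transformed once with trans_weight2winograd(wparams) and reused across all timed
+// loops, which is the intended deployment pattern (transform at model-load time,
+// not per inference). A minimal usage sketch of that lifecycle (hypothetical
+// wrapper name; winograd_release is the matching deallocator from srcn_conv.cc):
+#if 0
+void run_batch_winograd(const convMat_t &in, const convMat_t &flt, convMat_t &out,
+                        const convParams_t &p, winogradParams_t &wp, int threads)
+{
+  float *wweight = trans_weight2winograd(wp); // NULL if Winograd doesn't apply here
+  if (wweight)
+  {
+    srcn_batch_convolution2D(in, flt, out, p, wweight, threads, col_major);
+    winograd_release(wweight); // free the transformed weights when done
+  }
+  else
+  {
+    srcn_batch_convolution2D(in, flt, out, p, NULL, threads, col_major); // GEMM fallback
+  }
+}
+#endif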
filter.c * filter.n; i++)
+  {
+    filter.data[i] = 0.001 - i * 0.000001;
+  }
+
+  bias.w = outch;
+  bias.data = (float *)malloc(bias.w * sizeof(float));
+  if (!bias.data)
+    return 0;
+  for (int i = 0; i < bias.w; i++)
+  {
+    bias.data[i] = 0.f;
+  }
+
+  params.kernel_w = kernel_size;
+  params.kernel_h = kernel_size;
+  params.stride_w = stride;
+  params.stride_h = stride;
+  params.padding = padding;
+  params.pad_w = pad_l;
+  params.pad_h = pad_t;
+  params.dilation_w = dilation;
+  params.dilation_h = dilation;
+
+  const int m = output.c;
+  const int n = output.w * output.h;
+  const int k = params.kernel_h * params.kernel_w * input.c;
+
+  // ocl_context_t context;
+  size_t local_min[2] = {4, 4};
+  /**
+  if(conv_type == 1)
+  {
+    if(init_gpu(&context) < 0) return -1;
+    depthwise_conv_3x3S1_tune(&context, &input, &filter, &output, local_min);
+  }**/
+
+  gettimeofday(&start, NULL);
+  if (conv_type == 0)
+    srcn_depthwise_conv(input, filter, output, bias, params, 4,
+                        row_major); // convdw3x3s1_neon(input, output, filter, filter);
+  // else if(conv_type == 1) depthwise_conv_gpu3x3S1(&context, &input, &filter, &output, &params,
+  //                                                 local_min);
+  else if (conv_type == 2)
+  {
+    for (int i = 0; i < input.c; i++)
+    {
+      convMat_t _input;
+      convMat_t _output;
+      convMat_t _filter;
+      convParams_t _params = params;
+
+      _input.w = input.w;
+      _input.h = input.h;
+      _input.c = 1;
+      _input.n = 1;
+#ifdef NCNN
+      _input.data = input.data + i * alignSize(input.w * input.h, 16 / sizeof(float));
+#else
+      _input.data = input.data + i * input.w * input.h;
+#endif
+
+      _output.w = output.w;
+      _output.h = output.h;
+      _output.c = 1;
+      _output.n = 1;
+#ifdef NCNN
+      _output.data = output.data + i * alignSize(output.w * output.h, 16 / sizeof(float));
+#else
+      _output.data = output.data + i * output.w * output.h;
+#endif
+      _filter.w = filter.w;
+      _filter.h = filter.h;
+      _filter.c = 1; // filter.c;
+      _filter.n = 1; // filter.n;
+      _filter.data = filter.data + i * filter.w * filter.h; // was `i * 9`, which assumed 3x3
+
+      srcn_convolution2D(_input, _filter, _output, _params, NULL, 1, row_major);
+      // direct_conv_rowmajor(&_input, &_output, &_filter, &_params);
+    }
+  }
+
+  gettimeofday(&end, NULL);
+  total_time +=
+      ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000;
+
+  int div = m * n < 16 ? m * n : 16;
+  int num = m * n > 64 ?
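+// conv_type == 2 above cross-checks srcn_depthwise_conv by decomposing the
+// depthwise convolution into `inch` independent single-channel convolutions:
+// output channel c depends only on input channel c and one kernel_w x kernel_h
+// filter slice. A naive single-channel reference (illustrative sketch only;
+// assumes row-major HxW data with padding already resolved, so
+// w >= (outw - 1) * stride + kw):
+#if 0
+static void naive_depthwise_channel(const float *in, float *out, const float *flt, int w, int h,
+                                    int kw, int kh, int stride, int outw, int outh)
+{
+  for (int oy = 0; oy < outh; oy++)
+    for (int ox = 0; ox < outw; ox++)
+    {
+      float acc = 0.f;
+      for (int ky = 0; ky < kh; ky++)
+        for (int kx = 0; kx < kw; kx++)
+          acc += in[(oy * stride + ky) * w + (ox * stride + kx)] * flt[ky * kw + kx];
+      out[oy * outw + ox] = acc; // one output pixel of this channel
+    }
+}
+#endif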
64 : m * n; + + if (conv_type == 0) + printf("[CPU RESULT]\n"); + else if (conv_type == 1) + printf("[GPU RESULT]\n"); + float *c_ptr = output.data; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + c_ptr = &output.data[m * n - num]; + for (int i = 0; i < num; i++) + { + printf("%f ", c_ptr[i]); + if ((i + 1) % div == 0) + printf("\n"); + } + + printf("\n"); + + long long total_size = (long long)m * n * k * 2; + printf("AVER Time consuming: %.2fms, total size: %lld, (GFLOP: %.2f)\n", total_time / loops, + total_size, (double)total_size / (total_time / loops) / 1000000); + + free(input.data); + free(output.data); + free(filter.data); + free(bias.data); + + return 0; +} + +//#define TEST_SGEMM +#define TEST_CONV +//#define TEST_DECONV +//#define TEST_BATCH_CONV +//#define TEST_DEPTHWISE_CONV + +int main(int argc, char **argv) +{ +#ifdef TEST_SGEMM + if (argc < 6) + return 0; + + const int m = atoi(argv[1]); + const int n = atoi(argv[2]); + const int k = atoi(argv[3]); + const int type = atoi(argv[4]); + const int loops = atoi(argv[5]); + + test_sgemm(m, n, k, type, loops); +#elif (defined TEST_CONV) + if (argc < 10) + return 0; + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + const int kernel_size = atoi(argv[3]); + const int stride = atoi(argv[4]); + const int outch = atoi(argv[5]); + const int inch = atoi(argv[6]); + const int padding = atoi(argv[7]); + const int conv_type = atoi(argv[8]); + const int thread_num = atoi(argv[9]); + int loops = 1; + if (argc > 10) + loops = atoi(argv[10]); + test_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops); +#elif (defined TEST_DECONV) + if (argc < 10) + return 0; + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + const int kernel_size = atoi(argv[3]); + const int stride = atoi(argv[4]); + const int outch = atoi(argv[5]); + const int inch = atoi(argv[6]); + const int padding = atoi(argv[7]); + const int conv_type = atoi(argv[8]); + const int thread_num = atoi(argv[9]); + int loops = 1; + if (argc > 10) + loops = atoi(argv[10]); + test_deconv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, loops); +#elif (defined TEST_BATCH_CONV) + if (argc < 11) + return 0; + const int batch = atoi(argv[1]); + const int w = atoi(argv[2]); + const int h = atoi(argv[3]); + const int kernel_size = atoi(argv[4]); + const int stride = atoi(argv[5]); + const int outch = atoi(argv[6]); + const int inch = atoi(argv[7]); + const int padding = atoi(argv[8]); + const int conv_type = atoi(argv[9]); + const int thread_num = atoi(argv[10]); + int loops = 1; + if (argc > 11) + loops = atoi(argv[11]); + test_batch_conv(batch, w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, + loops); +#elif (defined TEST_DEPTHWISE_CONV) + if (argc < 10) + return 0; + const int w = atoi(argv[1]); + const int h = atoi(argv[2]); + const int kernel_size = atoi(argv[3]); + const int stride = atoi(argv[4]); + const int outch = atoi(argv[5]); + const int inch = atoi(argv[6]); + const int padding = atoi(argv[7]); + const int conv_type = atoi(argv[8]); + const int thread_num = atoi(argv[9]); + int loops = 1; + if (argc > 10) + loops = atoi(argv[10]); + test_depthwise_conv(w, h, kernel_size, stride, inch, outch, padding, conv_type, thread_num, + loops); +#endif + + return 0; +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/srcn_conv.cc b/compute/ncnn/src/srcn/srcn_conv.cc new file mode 100644 
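+// Usage note for the test driver above: exactly one TEST_* macro is enabled at
+// build time (TEST_CONV in this revision). With TEST_CONV the positional
+// arguments are w h kernel_size stride outch inch padding conv_type thread_num
+// [loops], e.g. (hypothetical binary name):
+//   ./srcn_test 224 224 3 1 64 32 1 1 4 10
+// runs a 224x224, 3x3 stride-1 convolution with 32 input / 64 output channels,
+// padding enabled, conv_type 1, 4 threads, and 10 timed loops.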
index 000000000..bb8e4f13e --- /dev/null +++ b/compute/ncnn/src/srcn/srcn_conv.cc @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "ncnn/srcn/conv_type.h" +#include "common.h" +#include "sgemm_singlethread.h" +#include "conv_sgemm_singlethread.h" +#include "conv_sgemm_multithreads.h" +#include "conv_winograd.h" +#include "direct_conv_colmajor.h" +#include "winograd.h" + +#include "deconv_sgemm_multithreads.h" +#include "conv_sparse.h" +#include "conv_winograd_batch.h" + +namespace nnfw +{ +namespace srcn +{ + +static inline void weight_transfer(float *out, float *in, int H, int W, int C, int N) +{ + // HWCN ---> NCHW + for (int h = 0; h < H; ++h) + { + for (int w = 0; w < W; ++w) + { + for (int c = 0; c < C; ++c) + { + for (int n = 0; n < N; ++n) + { + int index_in = h * W * C * N + w * C * N + c * N + n; + int index_out = n * C * H * W + c * H * W + h * W + w; + out[index_out] = in[index_in]; + } + } + } + } +} + +int check_winograd(winogradParams_t ¶ms) +{ + int winograd_flag = + ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) && + (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) && + (params.dilation_w == 1) && (params.dilation_h == 1)); + + int winograd_channel_cond = 64 * 64; + int winograd_image_cond = 10 * 10; + +#ifdef TIZEN + if (params.num_threads > 1) + { + winograd_channel_cond = 128 * 128; + winograd_image_cond = 20 * 20; + } +#endif // TIZEN + + winograd_flag &= (params.inch * params.outch >= winograd_channel_cond); + + if (params.w > 0 && params.h > 0 && params.batch == 1) + { + winograd_flag &= (params.w * params.h >= winograd_image_cond); + } + + return winograd_flag; +} + +float *trans_weight2winograd(winogradParams_t ¶ms, unsigned int *size = NULL) +{ + int M, N; + const double *G; + + float *winograd_weight; + + int winograd_channel_cond = 64 * 64; + int winograd_image_cond = 10 * 10; + +#ifdef TIZEN + if (params.num_threads > 1) + { + winograd_channel_cond = 128 * 128; + // int winograd_image_cond = 20 * 20; + } +#endif // TIZEN + + int winograd_flag = + ((params.kernel_w == params.kernel_h) && (params.stride_w == params.stride_h) && + (params.kernel_w == 3 || params.kernel_w == 5) && (params.stride_w == 1) && + (params.dilation_w == 1) && (params.dilation_h == 1)); + if (!winograd_flag) + return NULL; + + winograd_flag = (params.inch * params.outch >= winograd_channel_cond); + + if (!winograd_flag) + return NULL; + + if (params.w > 0 && params.h > 0 && params.batch == 1) + { + winograd_flag &= (params.w * params.h >= winograd_image_cond); + if (!winograd_flag) + return NULL; + } + + const int kernel_size = params.kernel_w; + const int inch = params.inch; + const int outch = params.outch; + float *weight_data = params.weight_data; + + /*Step 1: transfer weight to winograd domain*/ + if (kernel_size == 3) + { + if (params.w == 4 && params.batch > 1) + { + M 
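+// check_winograd above gates the fast path on shape, not data: square 3x3 or 5x5
+// kernels, stride 1, no dilation, and a problem large enough to amortize the tile
+// transforms (inch * outch >= 64 * 64, plus w * h >= 10 * 10 for single-image
+// input; both thresholds are raised on multi-threaded TIZEN builds). Worked
+// example: a 3x3 s1 layer with inch = 64, outch = 96 on a 28x28 input passes
+// (64 * 96 = 6144 >= 4096 and 28 * 28 = 784 >= 100), while inch = 16, outch = 32
+// fails the channel condition (512 < 4096) and the caller falls back to GEMM.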
= winograd_para_3x3s1_2::M; + N = winograd_para_3x3s1_2::N; + G = winograd_para_3x3s1_2::getG(); + } + else + { + M = winograd_para_3x3s1::M; + N = winograd_para_3x3s1::N; + G = winograd_para_3x3s1::getG(); + } + } + else + { + M = winograd_para_5x5s1::M; + N = winograd_para_5x5s1::N; + G = winograd_para_5x5s1::getG(); + } + + int tile_h_in_, tile_w_in_; + tile_h_in_ = tile_w_in_ = M; + + if (size) + *size = tile_h_in_ * tile_w_in_ * inch * outch; + + winograd_weight = new float[tile_h_in_ * tile_w_in_ * inch * outch]; + if (!winograd_weight) + return NULL; + + float *winograd_g = new float[M * M * N * N]; + if (!winograd_g) + { + delete[] winograd_weight; + return NULL; + } + + kronecker_product(winograd_g, G, G, M, N, M, N); + + if (params.conv_type == col_major) + { + weight_data = new float[kernel_size * kernel_size * inch * outch]; + if (!weight_data) + { + delete[] winograd_weight; + delete[] winograd_g; + return NULL; + } + weight_transfer(weight_data, params.weight_data, kernel_size, kernel_size, inch, outch); + } + + class sgemm_singlethread sgemm(rowMajor, notrans, trans, tile_h_in_ * tile_w_in_, inch * outch, + kernel_size * kernel_size, winograd_g, weight_data, + winograd_weight, 1); + + sgemm.run(); + + if (params.conv_type == col_major) + delete[] weight_data; + + delete[] winograd_g; + + return winograd_weight; +} + +void winograd_release(float *winograd_weight) +{ + if (winograd_weight) + delete[] winograd_weight; +} + +void srcn_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat, + const convParams_t &in_param, const float *winograd_weight, int num_threads, + convType_t conv_type) +{ + const int outw = out_mat.w; + const int outh = out_mat.h; + const int inch = in_mat.c; + const int outch = out_mat.c; + + int winograd_flag = + ((in_param.kernel_w == in_param.kernel_h) && (in_param.stride_w == in_param.stride_h) && + (in_param.kernel_w == 3 || in_param.kernel_w == 5) && (in_param.stride_w == 1) && + (winograd_weight) && (in_param.dilation_w == 1) && (in_param.dilation_h == 1)); + + int direct_flag = ((conv_type == col_major) && (in_param.stride_w == in_param.stride_h) && + (in_param.dilation_w == 1) && (in_param.dilation_h == 1)); + + int winograd_image_cond = 10 * 10; + int winograd_channel_cond = 64 * 64; + int direct_image_cond = 4 * 4; + int direct_channel_cond = 16 * 16; + +#ifdef TIZEN + if (num_threads > 1) + { + winograd_image_cond = 20 * 20; + winograd_channel_cond = 128 * 128; + } +#endif + + winograd_flag &= + ((outw * outh >= winograd_image_cond) && (inch * outch >= winograd_channel_cond)); + direct_flag &= ((outw * outh <= direct_image_cond) || (inch * outch <= direct_channel_cond)); + + if (num_threads == 1) + { + if (winograd_flag) + { + class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, num_threads, + in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + else if (direct_flag) + { + direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads); + } + else + { + class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type); + conv.run(); + } + } + else if (num_threads > 1) + { + if (winograd_flag) + { + const int npart = num_threads > 4 ? 
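+// The weight transform above never forms G g G^T tile by tile. It builds
+// (G (x) G) once with kronecker_product -- an (M*M) x (N*N) matrix -- and applies
+// it to all inch * outch flattened kernels in a single sgemm, using the identity
+// vec(G g G^T) = (G (x) G) vec(g). An illustrative expansion of what that sgemm
+// call computes (assuming the obvious row-major conventions; not a replacement):
+#if 0
+for (int i = 0; i < M * M; i++)          // each winograd-domain position
+  for (int c = 0; c < inch * outch; c++) // each (input, output) channel pair
+  {
+    double acc = 0.0;
+    for (int j = 0; j < N * N; j++)      // B operand is accessed transposed
+      acc += winograd_g[i * N * N + j] * weight_data[c * N * N + j];
+    winograd_weight[i * inch * outch + c] = (float)acc;
+  }
+#endif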
4 : num_threads; + + omp_set_num_threads(npart); + + if (conv_type == col_major) + { + if (outch < 512) + { + const int _H = (outh + npart - 1) / npart; + + if (_H < in_param.pad_h) + { + class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1, + in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + return; + } + + // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w; + // const int oh = _H; + const int nh = (outh + _H - 1) / _H; + int rh = outh % _H; + if (rh == 0) + rh = _H; + +#pragma omp parallel for + for (int i = 0; i < nh; i++) + { + int pad_h_part = 0; + convMat_t in_part; + convMat_t out_part; + const int oh = (i != nh - 1 || rh == 0) ? _H : rh; + const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w; + + in_part.w = in_mat.w; + in_part.c = inch; + out_part.w = outw; + out_part.c = outch; + in_part.h = ih; + out_part.h = oh; + + int bottom_offset = i * _H - in_param.pad_h; + if (bottom_offset < 0) + { + bottom_offset = 0; + pad_h_part = in_param.pad_h; + } + in_part.data = in_mat.data + bottom_offset * in_mat.w * inch * in_param.stride_w; + if (ih + bottom_offset > in_mat.h) + { + in_part.h = in_mat.h - bottom_offset; + } + + out_part.data = out_mat.data + i * _H * outw * outch; + + convParams_t params = { + in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1, + in_param.padding, in_param.pad_w, pad_h_part}; + + class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight, + num_threads, in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + } + else + { + const int _OUTC = (outch + npart - 1) / npart; + + const int nc = (outch + _OUTC - 1) / _OUTC; + int rc = out_mat.c % _OUTC; + if (rc == 0) + rc = _OUTC; + +#pragma omp parallel for + for (int i = 0; i < nc; i++) + { + const float *weight_part; + convMat_t out_part; + + const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc; + + out_part.w = outw; + out_part.h = outh; + out_part.c = oc; + out_part.data = out_mat.data + i * _OUTC; + weight_part = winograd_weight + i * _OUTC * inch; + class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part, + num_threads, in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + } + } + } + else if (conv_type == row_major) + { +#ifdef TIZEN + if (outch < 512) +#else // TIZEN + if (outh >= 20) +#endif // TIZEN + { + const int _H = (outh + npart - 1) / npart; + + if (_H < in_param.pad_h) + { + class conv_winograd conv(in_mat, out_mat, in_param, conv_type, winograd_weight, 1, + in_mat.w * in_mat.h, outw * outh, outch); + conv.run(); + return; + } + + // const int ih = (_H - 1) * in_param.stride_w + in_param.kernel_w; + // const int oh = _H; + const int nh = (outh + _H - 1) / _H; + int rh = outh % _H; + if (rh == 0) + rh = _H; + +#pragma omp parallel for + for (int i = 0; i < nh; i++) + { + int pad_h_part = 0; + convMat_t in_part; + convMat_t out_part; + const int oh = (i != nh - 1 || rh == 0) ? 
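+          // The height-partitioned Winograd path gives each of the npart (<= 4)
+          // OpenMP workers an output band of _H rows. Because a kernel_h-tall
+          // window is needed per output row, each worker reads an input band of
+          // ih = (oh - 1) * stride + kernel rows, so adjacent bands overlap by
+          // (kernel - stride) rows (the halo). The code uses stride_w/kernel_w for
+          // this, which equals stride_h/kernel_h here since the Winograd path
+          // requires square kernels and strides. Worked example (assumed sizes):
+          // outh = 64, npart = 4 gives _H = 16; a 3x3 stride-1 kernel then reads
+          // ih = 15 * 1 + 3 = 18 input rows per band, overlapping neighbours by 2.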
_H : rh;
+          const int ih = (oh - 1) * in_param.stride_w + in_param.kernel_w;
+
+          in_part.w = in_mat.w;
+          in_part.c = inch;
+          out_part.w = outw;
+          out_part.c = outch;
+          in_part.h = ih;
+          out_part.h = oh;
+
+          int bottom_offset = i * _H - in_param.pad_h;
+          if (bottom_offset < 0)
+          {
+            bottom_offset = 0;
+            pad_h_part = in_param.pad_h;
+          }
+          in_part.data = in_mat.data + bottom_offset * in_mat.w * in_param.stride_w;
+          if (ih + bottom_offset > in_mat.h)
+          {
+            in_part.h = in_mat.h - bottom_offset;
+          }
+
+          out_part.data = out_mat.data + i * _H * outw;
+
+          convParams_t params = {
+              in_param.kernel_w, in_param.kernel_h, in_param.stride_w, in_param.stride_h, 1, 1,
+              in_param.padding,  in_param.pad_w,    pad_h_part};
+
+          class conv_winograd conv(in_part, out_part, params, conv_type, winograd_weight,
+                                   num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+          conv.run();
+        }
+      }
+      else
+      {
+        const int _OUTC = (outch + npart - 1) / npart;
+
+        const int nc = (outch + _OUTC - 1) / _OUTC;
+        int rc = out_mat.c % _OUTC;
+        if (rc == 0)
+          rc = _OUTC;
+
+#pragma omp parallel for
+        for (int i = 0; i < nc; i++)
+        {
+          const float *weight_part;
+          convMat_t out_part;
+
+          const int oc = (i != nc - 1 || rc == 0) ? _OUTC : rc;
+
+          out_part.w = outw;
+          out_part.h = outh;
+          out_part.c = oc;
+          out_part.data = out_mat.data + i * _OUTC * outw * outh;
+          weight_part = winograd_weight + i * _OUTC * inch;
+          class conv_winograd conv(in_mat, out_part, in_param, conv_type, weight_part,
+                                   num_threads, in_mat.w * in_mat.h, outw * outh, outch);
+          conv.run();
+        }
+      }
+    }
+  }
+  else if (direct_flag)
+  {
+    direct_conv_colmajor(in_mat, out_mat, weights_mat, in_param, num_threads);
+  }
+  else
+  {
+    class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads,
+                                       conv_type);
+    conv.run();
+  }
+}
+
+void srcn_deconvolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, convMat_t &out_mat,
+                          const convParams_t &in_param, int num_threads, convType_t conv_type)
+{
+  class deconv_sgemm_multithreads deconv(in_mat, weights_mat, out_mat, in_param, num_threads,
+                                         conv_type);
+  deconv.run();
+}
+
+void *trans_weight2sparse(const convMat_t &weights_mat)
+{
+  const int kernel_w = weights_mat.w;
+  const int kernel_h = weights_mat.h;
+  const int inch = weights_mat.c;
+  const int outch = weights_mat.n;
+
+  const int nch = (outch + BCH - 1) / BCH;
+  const int rch = outch % BCH;
+
+  const float *data = weights_mat.data;
+  const int klength = inch * kernel_h * kernel_w;
+
+  sparse_weight_t *sparse_weight = new sparse_weight_t[nch];
+  if (!sparse_weight)
+    return NULL;
+
+  for (int i = 0; i < nch; i++)
+  {
+    int _bch = (i != nch - 1 || rch == 0) ? BCH : rch;
+    sparse_weight_t *sparse_weight_n = &sparse_weight[i];
+    sparse_weight_n->mxk = 0;
+
+    for (int j = 0; j < _bch; j++)
+    {
+      for (int l = 0; l < klength; l++)
+      {
+        float val = *(data + (i * BCH + j) * klength + l);
+        if (val != 0)
+        {
+          sparse_weight_n->mxk++;
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < nch; i++)
+  {
+    int _bch = (i != nch - 1 || rch == 0) ?
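+// trans_weight2sparse packs each block of BCH output channels into a flat list of
+// (m, k, value) triples: the first pass above counts the nonzeros per block
+// (mxk), and the second pass below fills them in k-major order so consecutive
+// entries share the same im2col row where possible. An illustrative consumer
+// (field names taken from this file; the sketch assumes a row-major im2col
+// buffer `col` of shape k x n):
+#if 0
+static void sparse_matmul_block(const sparse_weight_t *blk, const float *col, float *out, int n)
+{
+  for (int e = 0; e < blk->mxk; e++)
+  {
+    const weight_data_t &wd = blk->wdata[e];
+    const float *col_row = col + (size_t)wd.k * n; // row k of the im2col matrix
+    float *out_row = out + (size_t)wd.m * n;       // output channel m
+    for (int j = 0; j < n; j++)
+      out_row[j] += wd.data * col_row[j]; // one rank-1-style update per nonzero
+  }
+}
+#endif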
BCH : rch; + sparse_weight_t *sparse_weight_n = &sparse_weight[i]; + sparse_weight_n->wdata = new weight_data_t[sparse_weight_n->mxk]; + int index = 0; + + for (int l = 0; l < klength; l++) + { + for (int j = 0; j < _bch; j++) + { + float val = *(data + (i * BCH + j) * klength + l); + if (val != 0) + { + sparse_weight_n->wdata[index].m = i * BCH + j; + sparse_weight_n->wdata[index].k = l; + sparse_weight_n->wdata[index++].data = val; + } + } + } + } + + return (void *)sparse_weight; +} + +void sparse_release(const int outch, void *ptr) +{ + sparse_weight_t *sparse_weight = (sparse_weight_t *)ptr; + const int nch = (outch + BCH - 1) / BCH; + + if (!sparse_weight) + return; + + for (int i = 0; i < nch; i++) + { + sparse_weight_t *sparse_weight_n = &sparse_weight[i]; + if (sparse_weight_n->wdata) + delete[] sparse_weight_n->wdata; + } + + if (sparse_weight) + delete[] sparse_weight; +} + +void srcn_sparse_convolution2D(const convMat_t &in_mat, convMat_t &out_mat, + const convParams_t &in_param, const void *sparse_weight, + int number_threas, convType_t conv_type) +{ + class conv_sparse conv(in_mat, out_mat, in_param, (const sparse_weight_t *)sparse_weight, + number_threas, conv_type); + + for (int i = 0; i < out_mat.c * out_mat.h * out_mat.w; i++) + { + *(out_mat.data + i) = 0; + } + + conv.run(); +} + +void srcn_batch_convolution2D(const convMat_t &in_mat, const convMat_t &weights_mat, + convMat_t &out_mat, const convParams_t &in_param, + const float *winograd_weight, int num_threads, convType_t conv_type) +{ + int winograd_flag = (winograd_weight != NULL); + + if (winograd_flag) + { + if (num_threads > 1) + { + omp_set_num_threads(num_threads); + const int batch = in_mat.n; + const int npart = (batch + num_threads - 1) / num_threads; + const int nn = (batch + npart - 1) / npart; + const int rn = batch % npart; + +#pragma omp parallel for + for (int i = 0; i < nn; i++) + { + const int pn = (i != nn - 1 || rn == 0) ? npart : rn; + convMat_t in_mat_part = {in_mat.w, in_mat.h, in_mat.c, pn, + in_mat.data + i * npart * in_mat.w * in_mat.h * in_mat.c}; + convMat_t out_mat_part = {out_mat.w, out_mat.h, out_mat.c, pn, + out_mat.data + i * npart * out_mat.w * out_mat.h * out_mat.c}; + + class conv_winograd_batch conv(in_mat_part, out_mat_part, in_param, conv_type, + winograd_weight, num_threads); + conv.run(); + } + } + else + { + class conv_winograd_batch conv(in_mat, out_mat, in_param, conv_type, winograd_weight, + num_threads); + conv.run(); + } + } + else + { + if (num_threads == 1) + { + class conv_sgemm_singlethread conv(in_mat, weights_mat, out_mat, in_param, conv_type); + conv.run(); + } + else + { + class conv_sgemm_multithreads conv(in_mat, weights_mat, out_mat, in_param, num_threads, + conv_type); + conv.run(); + } + } +} + +} // namespace srcn +} // namespace nnfw diff --git a/compute/ncnn/src/srcn/winograd.h b/compute/ncnn/src/srcn/winograd.h new file mode 100644 index 000000000..5ad8f1126 --- /dev/null +++ b/compute/ncnn/src/srcn/winograd.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SRCN_WINOGRAD_H__
+#define __NNFW_SRCN_WINOGRAD_H__
+
+namespace nnfw
+{
+namespace srcn
+{
+
+struct winograd_para_3x3s1
+{
+  static const int M = 3 + 4 - 1;
+  static const int N = 3;
+
+  static const double *getG()
+  {
+    static const double G[M * N] = {
+        1. / 4.,  0,         0,
+        -1. / 6., -1. / 6.,  -1. / 6.,
+        -1. / 6., 1. / 6.,   -1. / 6.,
+        1. / 24., 1. / 12.,  1. / 6.,
+        1. / 24., -1. / 12., 1. / 6.,
+        0,        0,         1,
+    };
+    return G;
+  }
+
+  static const double *getA()
+  {
+    static const double A[M * (M - N + 1)] = {
+        1, 0,  0, 0,
+        1, 1,  1, 1,
+        1, -1, 1, -1,
+        1, 2,  4, 8,
+        1, -2, 4, -8,
+        0, 0,  0, 1,
+    };
+    return A;
+  }
+
+  static const double *getB()
+  {
+    static const double B[M * M] = {
+        4,  0,  0,  0,  0,  0,
+        0,  -4, 4,  -2, 2,  4,
+        -5, -4, -4, -1, -1, 0,
+        0,  1,  -1, 2,  -2, -5,
+        1,  1,  1,  1,  1,  0,
+        0,  0,  0,  0,  0,  1,
+    };
+    return B;
+  }
+};
+
+struct winograd_para_3x3s1_2
+{
+  static const int M = 3 + 2 - 1;
+  static const int N = 3;
+
+  static const double *getG()
+  {
+    static const double G[M * N] = {
+        1,       0,        0,
+        1. / 2., 1. / 2.,  1. / 2.,
+        1. / 2., -1. / 2., 1. / 2.,
+        0,       0,        1,
+    };
+    return G;
+  }
+
+  static const double *getA()
+  {
+    static const double A[M * (M - N + 1)] = {
+        1, 0,
+        1, 1,
+        1, -1,
+        0, 1,
+    };
+    return A;
+  }
+
+  static const double *getB()
+  {
+    static const double B[M * M] = {
+        1,  0, 0,  0,
+        0,  1, -1, -1,
+        -1, 1, 1,  0,
+        0,  0, 0,  1,
+    };
+    return B;
+  }
+};
+
+struct winograd_para_5x5s1
+{
+  static const int M = 5 + 4 - 1;
+  static const int N = 5;
+
+  static const double *getG()
+  {
+    static const double G[M * N] = {
+        1,         0,         0,        0,         0,
+        -2. / 9.,  -2. / 9.,  -2. / 9., -2. / 9.,  -2. / 9.,
+        -2. / 9.,  2. / 9.,   -2. / 9., 2. / 9.,   -2. / 9.,
+        1. / 90.,  1. / 45.,  2. / 45., 4. / 45.,  8. / 45.,
+        1. / 90.,  -1. / 45., 2. / 45., -4. / 45., 8. / 45.,
+        4. / 45.,  2. / 45.,  1. / 45., 1. / 90.,  1. / 180.,
+        4. / 45.,  -2. / 45., 1. / 45., -1. / 90., 1. / 180.,
+        0,         0,         0,        0,         1,
+    };
+    return G;
+  }
+
+  static const double *getA()
+  {
+    static const double A[M * (M - N + 1)] = {
+        1, 0,  0, 0,
+        1, 1,  1, 1,
+        1, -1, 1, -1,
+        1, 2,  4, 8,
+        1, -2, 4, -8,
+        8, 4,  2, 1,
+        8, -4, 2, -1,
+        0, 0,  0, 1,
+    };
+    return A;
+  }
+
+  static const double *getB()
+  {
+    static const double B[M * M] = {
+        1,        0,         0,         0,        0,        0,        0,        0,
+        0,        1,         -1,        1. / 2,   -1. / 2,  2,        -2,       -1,
+        -21. / 4, 1,         1,         1. / 4,   1. / 4,   4,        4,        0,
+        0,        -17. / 4,  17. / 4,   -5. / 2,  5. / 2,   -5. / 2,  5. / 2,   21. / 4,
+        21. / 4,  -17. / 4,  -17. / 4,  -5. / 4,  -5. / 4,  -5,       -5,       0,
+        0,        1,         -1,        2,        -2,       1. / 2,   -1. / 2,  -21. / 4,
+        -1,       1,         1,         1,        1,        1,        1,        0,
+        0,        0,         0,         0,        0,        0,        0,        1,
+    };
+    return B;
+  }
+};
+
+static void kronecker_product(float *out, const double *in1, const double *in2, int m, int n, int p,
+                              int q)
+{
+  for (int i = 0; i < m; ++i)
+  {
+    for (int j = 0; j < n; ++j)
+    {
+      for (int k = 0; k < p; ++k)
+      {
+        for (int l = 0; l < q; ++l)
+        {
+          out[(p * i + k) * n * q + q * j + l] = in1[n * i + j] * in2[k * q + l];
+          /* compute in double precision and then convert it back to Dtype for accuracy */
+        }
+      }
+    }
+  }
+}
+
+} // namespace srcn
+} // namespace nnfw
+
+#endif // __NNFW_SRCN_WINOGRAD_H__
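+// Reference for how these tables fit together (notation only, no new behavior):
+// for an r x r output tile and an s x s kernel, M = r + s - 1 and the 2-D
+// Winograd algorithm F(r x r, s x s) computes
+//   Y = A^T [ (G g G^T) .* (B^T d B) ] A
+// where g is the s x s kernel tile, d the M x M input tile, and .* the
+// element-wise product. trans_weight2winograd in srcn_conv.cc evaluates the
+// weight half offline via vec(G g G^T) = (G (x) G) vec(g), which is why it feeds
+// the kronecker_product of G with itself and the flattened kernels into sgemm.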