| field | value | date |
|---|---|---|
| author | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:04 +0900 |
| committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:04 +0900 |
| commit | 12d88feea8573f8490629cf62fc342b152e57d65 (patch) | |
| tree | 3c734cc4d629834d2d523f4575ef84cd64684e57 /compute | |
| parent | d6b371e095d737922187a518b8faba1ef6f3a2b1 (diff) | |
| download | nnfw-12d88feea8573f8490629cf62fc342b152e57d65.tar.gz nnfw-12d88feea8573f8490629cf62fc342b152e57d65.tar.bz2 nnfw-12d88feea8573f8490629cf62fc342b152e57d65.zip | |
Imported Upstream version 1.11.0 (upstream/1.11.0)
Diffstat (limited to 'compute')
236 files changed, 43514 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt new file mode 100644 index 000000000..58f558db2 --- /dev/null +++ b/compute/ARMComputeEx/CMakeLists.txt @@ -0,0 +1,36 @@ +nnfw_find_package(ARMCompute QUIET) + +if(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: need ARM Compute library") + return() +else(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: OK") +endif(NOT ARMCompute_FOUND) + +set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR}) + +file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp") + +# generate embeded cl_kernel +execute_process ( + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMAND bash -c "python resolve_includes.py" +) + +add_library(arm_compute_ex SHARED ${ACL_EX_SRCS}) +target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE}) +target_link_libraries(arm_compute_ex PRIVATE arm_compute) +target_link_libraries(arm_compute_ex PRIVATE nnfw_common) +target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage) +# Defines to enable validate check in debug build +target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS + $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED + ARM_COMPUTE_LOGGING_ENABLED>) +# Validate check functions are not used on release build +# Some parameter are used for validate check function call, and these parameter may not used on release build +# Because clang requires to add "-Wno-unused-parameter -Wno-unused-function" after "-Wall", +# this should be after linking nnfw_common and use interface lib linking +add_library(ignore_unused_warning INTERFACE) +target_compile_options(ignore_unused_warning INTERFACE -Wno-unused-parameter -Wno-unused-function) +target_link_libraries(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:ignore_unused_warning>) +install(TARGETS arm_compute_ex DESTINATION lib) diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h new file mode 100644 index 000000000..d29886a9d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLKernelLibraryEx.h + * @ingroup COM_AI_RUNTIME + * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines + * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL. + */ + +#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ +#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ + +#include "arm_compute/core/CL/OpenCL.h" + +#include <map> +#include <set> +#include <string> +#include <utility> + +namespace arm_compute +{ + +/** + * @brief Class to build OpenCL kernels added from nnfw + * */ +class CLKernelLibraryEx +{ + using StringSet = std::set<std::string>; + +private: + /** + * @brief Construct a new CLKernelLibraryEx object + */ + CLKernelLibraryEx(); + +public: + /** + * @brief Prevent instances of this class from being copied. + */ + CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; + + /** + * @brief Prevent instances of this class from being copied. + */ + const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; + + /** + * @brief Get the KernelLibrary singleton. + * @return The KernelLibrary instance + */ + static CLKernelLibraryEx &get(); + + /** + * @brief Initialise the kernel library. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @param[in] context CL context used to create programs. + * @param[in] device CL device for which the programs are created. + * @return N/A + */ + void init(std::string kernel_path, cl::Context context, cl::Device device) + { + _kernel_path = std::move(kernel_path); + _context = std::move(context); + _device = std::move(device); + } + + /** + * @brief Set the path that the kernels reside in. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @return N/A + */ + void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; + + /** + * @brief Get the path that the kernels reside in. + * @return the path of kernel files + */ + std::string get_kernel_path() { return _kernel_path; }; + + /** + * @brief Get the source of the selected program. + * @param[in] program_name Program name. + * @return Source of the selected program. + */ + std::string get_program_source(const std::string &program_name); + + /** + * @brief Set the CL context used to create programs. + * @note Setting the context also resets the device to the + * first one available in the new context. + * @param[in] context A CL context. + * @return N/A + */ + void set_context(cl::Context context) + { + _context = std::move(context); + if (_context.get() == nullptr) + { + _device = cl::Device(); + } + else + { + const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); + + if (cl_devices.empty()) + { + _device = cl::Device(); + } + else + { + _device = cl_devices[0]; + } + } + } + + /** + * @brief Return associated CL context. + * @return A CL context. + */ + cl::Context &context() { return _context; } + + /** + * @brief Set the CL device for which the programs are created. + * @param[in] device A CL device. 
+ * @return N/A + */ + void set_device(cl::Device device) { _device = std::move(device); } + + /** + * @brief Gets the CL device for which the programs are created. + * @return A CL device. + */ + cl::Device &get_device() { return _device; } + + /** + * @brief Return the device version + * @return The content of CL_DEVICE_VERSION + */ + std::string get_device_version(); + + /** + * @brief Create a kernel from the kernel library. + * @param[in] kernel_name Kernel name. + * @param[in] build_options_set Kernel build options as a set. + * @return The created kernel. + */ + Kernel create_kernel(const std::string &kernel_name, + const StringSet &build_options_set = {}) const; + + /** + * @brief Find the maximum number of local work items in a workgroup can be supported for the + * kernel. + * @param[in] kernel kernel object + */ + + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; + /** + * @brief Return the default NDRange for the device. + * @return default NDRangeof the device + */ + cl::NDRange default_ndrange() const; + + /** + * @brief Clear the library's cache of binary programs + * @return N/A + */ + void clear_programs_cache() + { + _programs_map.clear(); + _built_programs_map.clear(); + } + + /** + * @brief Access the cache of built OpenCL programs + * @return program map data structure of which key is name of kernel and value is + * kerel source name. (*.cl) + */ + const std::map<std::string, cl::Program> &get_built_programs() const + { + return _built_programs_map; + } + + /** + * @brief Add a new built program to the cache + * @param[in] built_program_name Name of the program + * @param[in] program Built program to add to the cache + * @return N/A + */ + void add_built_program(const std::string &built_program_name, cl::Program program); + + /** + * @brief Returns true if FP16 is supported by the CL device + * @return true if the CL device supports FP16 + */ + bool fp16_supported() const; + + /** + * @brief Returns true if int64_base_atomics extension is supported by the CL device + * @return true if the CL device supports int64_base_atomics extension + */ + bool int64_base_atomics_supported() const; + +private: + /** + * @brief Load program and its dependencies. + * @param[in] program_name Name of the program to load. + */ + const Program &load_program(const std::string &program_name) const; + /** + * @brief Concatenates contents of a set into a single string. + * @param[in] s Input set to concatenate. + * @return Concatenated string. + */ + std::string stringify_set(const StringSet &s) const; + + cl::Context _context; /**< Underlying CL context. */ + cl::Device _device; /**< Underlying CL device. */ + std::string _kernel_path; /**< Path to the kernels folder. */ + mutable std::map<std::string, const Program> + _programs_map; /**< Map with all already loaded program data. */ + mutable std::map<std::string, cl::Program> + _built_programs_map; /**< Map with all already built program data. */ + static const std::map<std::string, std::string> + _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map<std::string, std::string> + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. 
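This clone follows the same usage pattern as ACL's own CLKernelLibrary: initialise the singleton once with a CL context and device, then request kernels by name. Below is a minimal sketch under those assumptions; the kernel name and build option are illustrative placeholders, not names guaranteed to be registered in `_kernel_program_map`.

```cpp
#include "arm_compute/core/CL/CLKernelLibraryEx.h"

#include <set>
#include <string>
#include <utility>

using namespace arm_compute;

// Build one of the extension kernels for an already-created context/device.
// "embedding_lookup" stands in for any kernel name the extension library registers.
Kernel make_ex_kernel(cl::Context context, cl::Device device)
{
  CLKernelLibraryEx &lib = CLKernelLibraryEx::get();

  // With EMBEDDED_KERNELS the sources are compiled in, so the path is only a fallback.
  lib.init("./cl_kernels/", std::move(context), std::move(device));

  // Optional build options are passed as a set of "-D..." strings.
  std::set<std::string> build_opts{"-DDATA_TYPE=float"};
  return lib.create_kernel("embedding_lookup", build_opts);
}
```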
>*/ +}; +} +#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h new file mode 100644 index 000000000..a0aa0560b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H +#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the reduction operation kernel + * + * @note The default data type for an uninitialized output tensor is + * signed 32-bit integer (S32). It is the user's responsibility to check + * that the results do not overflow because the indices are computed + * in unsigned 32-bit (U32). 
+ */ +class CLArgMinMaxLayerKernelEx : public ICLKernel +{ +public: + /** Default constructor */ + CLArgMinMaxLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default; + /** Default destructor */ + ~CLArgMinMaxLayerKernelEx() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor of the previous iterations of @ref + * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[out] output Destination tensor. Data types supported: U32/S32 + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. + */ + void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, + unsigned int axis, ReductionOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArgMinMaxLayerKernelEx. + * + * @param[in] input Source tensor info. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor info of the previous iterations. Data types + * supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[in] output Destination tensor info. Data types supported: U32/S32 + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, ReductionOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_prev_output; + ICLTensor *_output; + unsigned int _reduction_axis; + ReductionOperation _op; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h new file mode 100644 index 000000000..bb6fcb8f5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ +class CLBinaryLogicalOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBinaryLogicalOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. 
+ */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h new file mode 100644 index 000000000..ed668fd9c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLCastBoolKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastBoolKernel class + */ + +#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ +#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple3DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class for the kernel converting boolean type + */ +class CLCastBoolKernel : public ICLSimple3DKernel +{ +public: + /** + * @brief Initialise the kernel's input and output. + * @param[in] input Input tensor. Data types supported: U8 + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLCastBoolKernel + * + * @param[in] input Source tensor info. Data types supported: U8. + * @param[in] output Destination tensor info. 
Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h new file mode 100644 index 000000000..a614d5259 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * @file CLEmbeddingLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLEmbeddingLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform EmbeddingLookup operation with opencl kernel +*/ +class CLEmbeddingLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLEmbeddingLookupKernel object + * */ + CLEmbeddingLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLEmbeddingLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] input Source tensor. + * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] lookups Lookups are 1D tensor that values are indices into the first + * dimension of input. + * Data types supported: S32. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLEmbeddingLookupKernel + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * @param[in] lookups Lookups info. Data types supported: S32. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). 
+ * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + const ICLTensor *_lookups; /** Lookups tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h new file mode 100644 index 000000000..6630c7be7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLGatherExKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLGatherExKernel class + */ + +#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ +#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define an interface for the gather kernel. + */ +class CLGatherExKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLGatherExKernel object + * */ + CLGatherExKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel(const CLGatherExKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ */ + CLGatherExKernel &operator=(const CLGatherExKernel &) = delete; + + /** + * @brief Construct CLGatherExKernel object by using default move constructor + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel(CLGatherExKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel &operator=(CLGatherExKernel &&) = default; + + /** + * @brief Initialise the kernel's input, output and border mode. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLGatherExKernel + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_indices; + ICLTensor *_output; + int _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h new file mode 100644 index 000000000..99cfa61ec --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLHashtableLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLHashtableLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform HashtableLookup operation with opencl kernel +*/ +class CLHashtableLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLHashtableLookupKernel object + * */ + CLHashtableLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Construct a CLHashtableLookupKernel object by using default move constructor + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLHashtableLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). 
Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, + ICLTensor *output, ICLTensor *hits); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLHashtableLookupKernel + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output The output tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_lookups{nullptr}; /** Lookups tensor */ + const ICLTensor *_keys{nullptr}; /** Keys tensor */ + const ICLTensor *_input{nullptr}; /** Source tensor */ + ICLTensor *_output{nullptr}; /** Destination tensor */ + ICLTensor *_hits{nullptr}; /** Hits tensor */ + std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /** Lookup indices tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..f57e799ad --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ +#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for performing an instance normalization */ +class CLInstanceNormalizationLayerKernelEx : public ICLKernel +{ +public: + /** Constructor */ + CLInstanceNormalizationLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLInstanceNormalizationLayerKernelEx & + operator=(const CLInstanceNormalizationLayerKernelEx &) = delete; + /** Default Move Constructor. */ + CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default; + /** Default move assignment operator */ + CLInstanceNormalizationLayerKernelEx & + operator=(CLInstanceNormalizationLayerKernelEx &&) = default; + /** Default destructor */ + ~CLInstanceNormalizationLayerKernelEx() = default; + + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: + * NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr, + ICLTensor *beta = nullptr, float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLInstanceNormalizationLayerEx. + * + * @param[in] input Source tensor info. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. 
Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_output; + ICLTensor *_gamma; + ICLTensor *_beta; + float _epsilon; + bool _run_in_place; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h new file mode 100644 index 000000000..90e8b5705 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ +#define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface to multiply scale factor kernel. 
*/ +class CLMultiplyScaleFactorKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLMultiplyScaleFactorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMultiplyScaleFactorKernel(const CLMultiplyScaleFactorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMultiplyScaleFactorKernel &operator=(const CLMultiplyScaleFactorKernel &) = delete; + /** Default Move Constructor. */ + CLMultiplyScaleFactorKernel(CLMultiplyScaleFactorKernel &&) = default; + /** Default move assignment operator */ + CLMultiplyScaleFactorKernel &operator=(CLMultiplyScaleFactorKernel &&) = default; + /** Default destructor */ + ~CLMultiplyScaleFactorKernel() = default; + /** Set input, output tensors. + * + * @param[in/out] input Source tensor. Data type supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor. + * @param[in] multiplier Additional scale value. + */ + void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output, + float multiplier = 1.f); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLMultiplyScaleFactorKernel + * + * @param[in] input Input tensor info. Data types supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor. + * @param[in] multiplier Additional scale value. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_scale_factor; + ICLTensor *_output; + float _multiplier; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h new file mode 100644 index 000000000..fa383c0d0 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ +#define __ARM_COMPUTE_CLNEGKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform a negation operation on tensor*/ +class CLNegKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLNegKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel(const CLNegKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel &operator=(const CLNegKernel &) = delete; + /** Allow instances of this class to be moved */ + CLNegKernel(CLNegKernel &&) = default; + /** Allow instances of this class to be moved */ + CLNegKernel &operator=(CLNegKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h new file mode 100644 index 000000000..a512057b9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__ +#define __ARM_COMPUTE_CLONEHOTKERNEL_H__ +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" +namespace arm_compute +{ +class ICLTensor; +/** Interface for the kernel to perform one-hot encoding*/ +class CLOneHotKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLOneHotKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHotKernel(const CLOneHotKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHotKernel &operator=(const CLOneHotKernel &) = delete; + /** Allow instances of this class to be moved */ + CLOneHotKernel(CLOneHotKernel &&) = default; + /** Allow instances of this class to be moved */ + CLOneHotKernel &operator=(CLOneHotKernel &&) = default; + /** Default destructor */ + ~CLOneHotKernel() = default; + /** Initialise the kernel's inputs and output + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value, + ICLTensor *output, int depth, int axis = -1); + /** Initialise the kernel's inputs and output already initialized to off_value + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, int depth, + int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel without off_value + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *output, int depth, int axis = -1); + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + /** Initialise the kernel's inputs and outputs internally + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * value must be in range [-indices.rank , indices.rank) + */ + void configure_common(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + int depth, int axis); + +private: + const ICLTensor *_indices; /**< Indices tensor */ + const ICLTensor *_on_value; /**< On value tensor */ + const ICLTensor *_off_value; /**< Off value tensor */ + ICLTensor *_output; /**< Destination tensor */ + bool _is_off_value_memset; /**< Whether off_value is zero */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLONEHOTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h new file mode 100644 index 000000000..4e1b56cba --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ +#define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the quantization layer kernel. + * + * @note The implementation supports only 2D input tensors. 
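 * @note A possible call sequence (illustrative sketch, not part of the imported sources;
 *       the 2D shapes and the per-batch scale length below are assumptions):
 * @code
 * CLTensor input, scale_factor, output;
 * input.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32));
 * scale_factor.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
 * // Output auto initialization is not supported, so the S8 destination is initialized explicitly.
 * output.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::S8));
 * CLQuantizationSymmetricKernel quant;
 * quant.configure(&input, &scale_factor, &output);
 * @endcode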
+ */ +class CLQuantizationSymmetricKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLQuantizationSymmetricKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLQuantizationSymmetricKernel(const CLQuantizationSymmetricKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLQuantizationSymmetricKernel &operator=(const CLQuantizationSymmetricKernel &) = delete; + /** Default Move Constructor. */ + CLQuantizationSymmetricKernel(CLQuantizationSymmetricKernel &&) = default; + /** Default move assignment operator */ + CLQuantizationSymmetricKernel &operator=(CLQuantizationSymmetricKernel &&) = default; + /** Default destructor */ + ~CLQuantizationSymmetricKernel() = default; + /** Set the input, output. + * + * @param[in] input Source tensor. Data types supported: F32/F16. + * @param[in] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + * @param[out] output Destination tensor with the same dimensions of input. Data types supported: + * S8. + * + * @note Output auto initialization is not supported by this kernel + */ + void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLQuantizationSymmetricKernel + * + * @param[in] input Input tensor info. Data types supported: F32/F16. + * @param[in] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + * @param[in] output Destination tensor info with the same dimensions of input. Data types + * supported: S8. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_scale_factor; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h new file mode 100644 index 000000000..4f9042e41 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLReduceOperationKernel.h + * @brief This file defines CLReduceOperationKernel class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the reduce operation kernel + */ +class CLReduceOperationKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor + */ + CLReduceOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel(const CLReduceOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel(CLReduceOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default; + /** + * @brief Default destructor + */ + ~CLReduceOperationKernel() = default; + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, + ReductionOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperationKernel. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. 
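 * @note A matching configure() call might look as follows (illustrative sketch, not part of
 *       the imported sources; the shapes, axis and reduction operation are assumptions):
 * @code
 * CLTensor input, output;
 * input.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));
 * // The reduced axis is kept with size 1, so output has the same number of dimensions as input.
 * output.allocator()->init(TensorInfo(TensorShape(8U, 1U, 2U), 1, DataType::F32));
 * CLReduceOperationKernel reduce;
 * reduce.configure(&input, &output, 1U, ReductionOperation::SUM);
 * @endcode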
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReductionOperation op); + + /* + * @brief Run CLReduceOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue CLQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h new file mode 100644 index 000000000..4d4478ece --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ +#define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel to perform min max search on a 3D tensor. 
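 * A possible usage sketch (illustrative only, not part of the imported sources; the 2D input
 * shape below is an assumption):
 * @code
 * CLTensor input, scale_factor;
 * input.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32));   // [width, batches]
 * scale_factor.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32)); // one scale per batch
 * CLScaleFactorSymm8Kernel scale_kernel;
 * scale_kernel.configure(&input, &scale_factor);
 * CLScheduler::get().enqueue(scale_kernel);
 * @endcode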
+ */ +class CLScaleFactorSymm8Kernel : public ICLKernel +{ +public: + /** Default constructor */ + CLScaleFactorSymm8Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLScaleFactorSymm8Kernel(const CLScaleFactorSymm8Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLScaleFactorSymm8Kernel &operator=(const CLScaleFactorSymm8Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLScaleFactorSymm8Kernel(CLScaleFactorSymm8Kernel &&) = default; + /** Allow instances of this class to be moved */ + CLScaleFactorSymm8Kernel &operator=(CLScaleFactorSymm8Kernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor with 2 dimensions. The first dimension will be interpreted as + * batches. Data types supported: F32. + * @param[out] output Output tensor with shape [batches] which stores the scale values for each 2D + * input tensor. + * The dimensions over the first must match the batched dimensions of the input + * tensor. Data types supported: F32. + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLScaleFactorSymm8Kernel + * + * @param[in] input Input tensor info. Data types supported: F32. + * @param[in] output Output tensor info with shape [batches] which stores the scale values for + * each 2D input tensor. + * The dimensions over the first must match the batched dimensions of the input + * tensor. Data types supported: F32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + /** Resets global minimum and maximum + * + * @param[in,out] queue Command queue on which to map and unmap the min_max tensor + */ + void reset(cl::CommandQueue &queue); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h new file mode 100644 index 000000000..aa4a14812 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLTopKV2Kernel.h + * @brief This file defines classes for TopKV2Kernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ +#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +// these parameters can be changed +#define _ITEMS 16 // number of items in a group +#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS +#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram +#define PERMUT // store the final permutation +//////////////////////////////////////////////////////// + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define CLTopKV2Single + */ +class CLTopKV2Single : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Single(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + */ + CLTopKV2Single(const CLTopKV2Single &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + * @return Reference of this instance + */ + CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + */ + CLTopKV2Single(CLTopKV2Single &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + * @return Reference of this instance + */ + CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] topk_values Values of the top k predictions + * @param[in] topk_indices Indices of the top k predictions + * @param[in] indices Indices + * @param[in] temp_stack Temp stack + * @param[in] k K of the top k predictions + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); + + /* + * @brief Run CLTopKV2Single op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_topk_values; + ICLTensor *_topk_indices; +}; + +/** + * @brief Class to define CLTopKV2Init + */ +class CLTopKV2Init : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Init(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + */ + CLTopKV2Init(const CLTopKV2Init &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + * @return Reference of this instance + */ + CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + */ + CLTopKV2Init(CLTopKV2Init &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + * @return Reference of this instance + */ + CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] in_key_buf Buffer of input key + * @param[in] in_ind_buf Buffer of input index + * @param[in] n Number times to quick-sort + * return N/A + */ + void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); + + /* + * @brief Run CLTopKV2Init op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; +}; + +/** + * @brief Class to define CLRadixSortHistogram + */ +class CLRadixSortHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + */ + CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + */ + CLRadixSortHistogram(CLRadixSortHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + } + + /* + * @brief Run CLRadixSortHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; +}; + +/** + * @brief Class to define CLRadixSortScanHistogram + */ +class CLRadixSortScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + */ + CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + */ + CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortGlobalScanHistogram + */ +class CLRadixSortGlobalScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortGlobalScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + */ + CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + */ + CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] glob_sum_buf Buffer of global sum + * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); + + /* + * @brief Run CLRadixSortGlobalScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortPasteHistogram + */ +class CLRadixSortPasteHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortPasteHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + */ + CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + */ + CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /* + * @brief Run CLRadixSortPasteHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortReorder + */ +class CLRadixSortReorder : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortReorder(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + */ + CLRadixSortReorder(const CLRadixSortReorder &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + */ + CLRadixSortReorder(CLRadixSortReorder &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + /* + * @brief Run CLRadixSortReorder op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2FindFirstNegative + */ +class CLTopKV2FindFirstNegative : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2FindFirstNegative(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + */ + CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + */ + CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set output buffer + * @param[out] out_key_buf Buffer of output key + * return N/A + */ + void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } + + /* + * @brief Run CLTopKV2FindFirstNegative op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_out_key_buf; +}; + +/** + * @brief Class to define CLTopKV2ReorderNegatives + */ +class CLTopKV2ReorderNegatives : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2ReorderNegatives(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + */ + CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + */ + CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set buffers + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + + /* + * @brief Run CLTopKV2ReorderNegatives op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2Store + */ +class CLTopKV2Store : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Store(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + */ + CLTopKV2Store(const CLTopKV2Store &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). 
+ * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + * @return Reference of this instance + */ + CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + */ + CLTopKV2Store(CLTopKV2Store &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + * @return Reference of this instance + */ + CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] values Values tensor to store + * @param[out] indices Indices tensor to be used for store + * @param[in] k K of the top k predictions + * @param[in] n Number times to store + * return N/A + */ + void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + + /** + * @brief Set buffers + * @param[out] out_key_buf Buffer of output key + * @param[out] out_ind_buf Buffer of output index + * return N/A + */ + void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); + + /* + * @brief Run CLTopKV2Store op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_values; + ICLTensor *_indices; + cl::Buffer *_out_key_buf; + cl::Buffer *_out_ind_buf; +}; + +} // namespace arm_compute +#endif // Disable GPU implementation +#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h new file mode 100644 index 000000000..933d8760d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ +#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +class ITensor; +class Window; +class QuantizationInfo; +} // namespace arm_compute + +namespace arm_compute +{ + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)); +} // namespace arm_compute +#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h new file mode 100644 index 000000000..8c544cda8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
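// Editorial note on the elementwise_op() helpers declared above (not part of the imported
// sources): each overload receives three callbacks -- a scalar fallback, a broadcast loop and a
// vectorized NEON loop -- and dispatches between them depending on whether the two inputs have
// matching shapes or one of them is broadcast. A rough sketch of a caller, with hypothetical
// callback names (the real callbacks live in the corresponding kernel implementations):
//
//   float add_scalar(const float &a, const float &b) { return a + b; }
//   int   add_broadcast(int window_start, int window_end, int window_step,
//                       const float *in, const float &broadcast_value, float *out, const bool reorder);
//   int   add_neon(int window_start, int window_end, int window_step,
//                  const float *in1, const float *in2, float *out);
//
//   elementwise_op(in1, in2, out, window, &add_scalar, &add_broadcast, &add_neon);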
+ */ + +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ + +class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel +{ +public: + /** Default destructor */ + ~NEBinaryLogicalOperationKernel() = default; + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[in] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a Status + */ + static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1, + const ITensorInfo *input2, const ITensorInfo *output); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, + const ITensorInfo &output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h new file mode 100644 index 000000000..101f6ac8e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
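// Editorial usage sketch for the NEBinaryLogicalOperationKernel declared above (not part of the
// imported sources; the BinaryLogicalOperation value, tensor shapes and scheduler call are
// assumptions):
//
//   Tensor a, b, result;
//   a.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::U8));
//   b.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::U8));
//   result.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::U8));
//
//   NEBinaryLogicalOperationKernel logical;
//   logical.configure(BinaryLogicalOperation::AND, &a, &b, &result);
//   NEScheduler::get().schedule(&logical, Window::DimY);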
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__ +#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class for the kernel converting boolean type + */ +class NECastBoolKernel : public INEKernel +{ +public: + const char *name() const override { return "NECastBoolKernel"; } + /** Default constructor*/ + NECastBoolKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastBoolKernel(const NECastBoolKernel &) = delete; + /** Default move constructor */ + NECastBoolKernel(NECastBoolKernel &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastBoolKernel &operator=(const NECastBoolKernel &) = delete; + /** Default move assignment operator */ + NECastBoolKernel &operator=(NECastBoolKernel &&) = default; + /** Set the input and output of the kernel + * + * Valid conversions Input -> Output : + * + * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16 + * + * @param[in] input The input tensor to convert. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NECastBoolKernel + * + * @param[in] input Source tensor info. Data types supported: U8 + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTBOOLKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h new file mode 100644 index 000000000..88f21c96e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018 ARM Limited. 
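// Editorial usage sketch for the NECastBoolKernel declared above (not part of the imported
// sources; the destination data type and shapes are assumptions chosen from the documented
// U8 -> U8/S8/U16/S16/U32/S32/F16/F32 conversions):
//
//   Tensor flags, casted;
//   flags.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::U8));   // boolean input
//   casted.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
//
//   NECastBoolKernel cast;
//   cast.configure(&flags, &casted);
//   NEScheduler::get().schedule(&cast, Window::DimY);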
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform EmbeddingLookup operation */ +class NEEmbeddingLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEEmbeddingLookupKernel"; } + /** Default constructor */ + NEEmbeddingLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[in] lookups Lookups are 1D tensor that values are indices into the first dimension of + * input. + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEEmbeddingLookupKernel + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Destination tensor. Data types supported: same as @p input. + * @param[in] lookups Lookups info. Data types supported: S32. 
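 * @note A matching configure() call might look as follows (illustrative sketch, not part of
 *       the imported sources; the table, lookup and output shapes are assumptions):
 * @code
 * Tensor table, lookups, output;
 * table.allocator()->init(TensorInfo(TensorShape(64U, 1000U), 1, DataType::F32)); // embedding table
 * lookups.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));       // 8 lookup indices
 * output.allocator()->init(TensorInfo(TensorShape(64U, 8U), 1, DataType::F32));   // gathered rows
 * NEEmbeddingLookupKernel lookup;
 * lookup.configure(&table, &output, &lookups);
 * @endcode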
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + const ITensor *_lookups; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h new file mode 100644 index 000000000..5acfde5a8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__ +#define __ARM_COMPUTE_NEGATHERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Kernel to perform other operation on NEON */ +class NEGatherKernelEx : public INEKernel +{ +public: + /** Default constructor. */ + NEGatherKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx(const NEGatherKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete; + /** Allow instances of this class to be moved. */ + NEGatherKernelEx(NEGatherKernelEx &&) = default; + /** Allow instances of this class to be moved. 
*/ + NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default; + /** Default detructor */ + ~NEGatherKernelEx() = default; + + /** Name of the kernel + * + * @return Kernel name + */ + const char *name() const override { return "NEGatherKernelEx"; } + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Implementation of the gather operation for 0 axis. + * + * For gather on the 0 axis an element by element copy is performed. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info); + + /** Implementation of the gather operation. + * + * For 1<=axis a row-wise copy is taking place. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info); + + using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info); + + const ITensor *_input; + const ITensor *_indices; + int _axis; + size_t _indices_rank; + ITensor *_output; + kernel_ptr _func; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h new file mode 100644 index 000000000..cb2a485d5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform HashtableLookup operation */ +class NEHashtableLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEHashtableLookupKernel"; } + /** Default constructor */ + NEHashtableLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default; + /** Initialize the kernel's inputs, outputs. + * + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * input. 
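+ *
+ * Illustrative note (a reading of the parameters above, not normative for this kernel):
+ * @p keys and @p input together act as a key-to-row map, so @p output receives one row of
+ * @p input per element of @p lookups, and each @p hits element records whether a matching
+ * key was found for that lookup.
+ * @code
+ * // Hedged sketch: the five tensors are assumed to be allocated elsewhere with the
+ * // data types listed above.
+ * NEHashtableLookupKernel kernel;
+ * kernel.configure(&lookups, &keys, &input, &output, &hits);
+ * @endcode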
+ */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEHashtableLookupKernel + * + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output The output tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits (True) or not (False). Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_lookups; /** Lookups tensor */ + const ITensor *_keys; /** Keys tensor */ + const ITensor *_input; /** Source tensor */ + ITensor *_output; /** Destination tensor */ + ITensor *_hits; /** Hits tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..8724cc69b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for performing an instance normalization */ +class NEInstanceNormalizationLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; } + /** Default constructor */ + NEInstanceNormalizationLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx & + operator=(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx & + operator=(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Default destructor */ + ~NEInstanceNormalizationLayerKernelEx() = default; + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: + * NCHW + * In case of @p output tensor = nullptr this tensor will store the result + * of the normalization. + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr, ITensor *beta = nullptr, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Common signature for all the specialized instance normalization functions + * + * @param[in, out] input An input tensor. 
In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * @param[out] output The output tensor. + * @param[in] gamma The scale scalar value applied to the normalized tensor. Defaults to + * 1.0 + * @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to + * 0.0 + * @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12 + */ + using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window); + + NormalizationFunction *_func; + ITensor *_input; + ITensor *_output; + ITensor *_gamma; + ITensor *_beta; + float _epsilon; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h new file mode 100644 index 000000000..198b0be9d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ +#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface to multiply scale factor kernel. 
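+ *
+ * Reading the configure() parameters below, the expected behaviour is an element-wise
+ * multiplication of the S32 @p input by @p scale_factor (and the optional multiplier),
+ * producing a floating-point @p output; this is an inference from the signatures, not a
+ * statement taken from this header. Under that reading, an S32 value of 20 with a scale
+ * factor of 0.05 and multiplier 1.0 would yield 1.0f.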
*/ +class NEMultiplyScaleFactorKernel : public INEKernel +{ +public: + const char *name() const override { return "NEMultiplyScaleFactorKernel"; } + /** Default constructor */ + NEMultiplyScaleFactorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete; + /** Default Move Constructor. */ + NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default; + /** Default move assignment operator */ + NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default; + /** Default destructor */ + ~NEMultiplyScaleFactorKernel() = default; + /** Set input, output tensors. + * + * @param[in/out] input Source tensor. Data type supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor. + */ + void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output, + float multiplier = 1.f); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEMultiplyScaleFactorKernel + * + * @param[in] input Input tensor info. Data types supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output, float multiplier = 1.f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void multiply(const Window &window); + +private: + const ITensor *_input; + const ITensor *_scale_factor; + ITensor *_output; + float _multiplier; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h new file mode 100644 index 000000000..99bb351bc --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__ +#define __ARM_COMPUTE_NEONEHOTKERNEL_H__ +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" +namespace arm_compute +{ +// Forward declarations +class ITensor; +/** Kernel to perform other operation on NEON */ +class NEOneHotKernel : public INEKernel +{ +public: + /** Default constructor. */ + NEOneHotKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEOneHotKernel(const NEOneHotKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEOneHotKernel &operator=(const NEOneHotKernel &) = delete; + /** Allow instances of this class to be moved. */ + NEOneHotKernel(NEOneHotKernel &&) = default; + /** Allow instances of this class to be moved. */ + NEOneHotKernel &operator=(NEOneHotKernel &&) = default; + /** Default detructor */ + ~NEOneHotKernel() = default; + /** Name of the kernel + * + * @return Kernel name + */ + const char *name() const override { return "NEOneHotKernel"; } + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up to + * 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: Same + * as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + */ + void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEOneHotKernel + * + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: + * up to 3. 
Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis = -1); + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Implementation of the onehot operation for 0 axis. + * + * For onehot on the 0 axis an element by element copy is performed. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void onehot_0_axis(const Window &window, const ThreadInfo &info); + /** Implementation of the onehot operation. + * + * For 1<=axis a row-wise copy is taking place. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void onehot_n_axis(const Window &window, const ThreadInfo &info); + using kernel_ptr = void (NEOneHotKernel::*)(const Window &window, const ThreadInfo &info); + const ITensor *_indices; + const ITensor *_depth; + const ITensor *_on_value; + const ITensor *_off_value; + int _axis; + ITensor *_output; + kernel_ptr _func; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEONEHOTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h new file mode 100644 index 000000000..0b080cf73 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ +#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the dequantization layer kernel. */ +class NEQuantizationSymmetricKernel : public INEKernel +{ +public: + const char *name() const override { return "NEQuantizationSymmetricKernel"; } + /** Default constructor */ + NEQuantizationSymmetricKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete; + /** Default Move Constructor. */ + NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default; + /** Default move assignment operator */ + NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default; + /** Default destructor */ + ~NEQuantizationSymmetricKernel() = default; + /** Set input, output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output, ITensor *scale_factor); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEQuantizationSymmetricKernel + * + * @param[in] input Input tensor info. Data types supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. 
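+ *
+ * Background note (the usual definition of symmetric quantization, not a specification of
+ * this kernel's exact rounding): a scale is derived from the largest magnitude in the data,
+ * scale = max(|x|) / 127, and each value is stored as an S8 quantity q = round(x / scale),
+ * so x is recovered approximately as q * scale. For example, with max(|x|) = 2.54 the scale
+ * is 0.02 and x = 1.0 maps to q = 50.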
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void quantize(const Window &window); + +private: + const ITensor *_input; + ITensor *_output; + ITensor *_scale_factor; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h new file mode 100644 index 000000000..cda8a30b1 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_TYPESEX_H__ +#define __ARM_COMPUTE_TYPESEX_H__ + +namespace arm_compute +{ + +/** Available ArgIndex operations **/ +enum class ArgOperation +{ + MAX, + MIN, +}; + +/** Available binary logical operations */ +enum class BinaryLogicalOperation +{ + AND, /**< AND */ + OR, /**< OR */ +}; + +enum class ComparisonOperationEx +{ + EQUAL, /**< EQUAL */ + NOT_EQUAL, /**< NOT_EQUAL */ +}; + +enum class ElementWiseUnaryEx +{ + NEG, /**< NEG */ +}; + +enum class SubDataType +{ + NONE, + BOOL, +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h new file mode 100644 index 000000000..d57e8fcf5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_UTILSEX_H__ +#define __ARM_COMPUTE_UTILSEX_H__ + +#include <utility> + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ + +/** Returns expected width and height of the transpose convolution's output tensor. + * + * @note This function was copied in order to fix a bug computing to wrong output dimensions. + * + * @param[in] in_width Width of input tensor (Number of columns) + * @param[in] in_height Height of input tensor (Number of rows) + * @param[in] kernel_width Kernel width. + * @param[in] kernel_height Kernel height. + * @param[in] info padding and stride info. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_top The number of zeros added to bottom edge of the output. + * + * @return A pair with the new width in the first position and the new height in the second. + */ +const std::pair<unsigned int, unsigned int> +transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_top); +} +#endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h new file mode 100644 index 000000000..1e69f0912 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ +#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Utils.h" + +#include "arm_compute/core/utils/helpers/tensor_transform.h" + +#include <cmath> + +namespace arm_compute +{ +namespace misc +{ +namespace shape_calculator +{ + +/** Calculate the upsampled output shape used for transpose convolution + * + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * @param[in] info Padding and stride info + * @param[in] out_dims Output shape dimensions + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * @param[out] pad_left Padding on left + * @param[out] pad_right Padding on right + * @param[out] pad_top Padding on top + * @param[out] pad_bottom Padding on bottom + * + * @return the calculated shape + */ +inline TensorShape compute_transposeconv_upsampled_shape( + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) +{ + unsigned int sx = info.stride().first; + unsigned int sy = info.stride().second; + const DataLayout data_layout = input.data_layout(); + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Find the upsampled dimensions + // transpose conv out: + // tconv_out + pad = 1 + (in - 1) * stride + invalid + // tconv_out = 1 + (in - 1) * stride + invalid - pad + // upsample out: + // upsample_out = 1 + (in - 1) * stride + unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1; + unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1; + + // Find the padding needed for the convolution with stride 1 in order to match output shape + // upsample+pad out: + // upsample_out + pad = tconv_out + kernel - 1 + // pad = tconv_out + kernel - 1 - upsample_out + unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1); + unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1); + out_x += padx; + out_y += pady; + + unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; + unsigned int pady_all_except_invallid = + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); + pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right; + pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); + pad_bottom = pady_all_except_invallid / 2 - info.pad_bottom() + invalid_bottom; + + TensorShape scale_out_shape(input.tensor_shape()); + scale_out_shape.set(idx_w, out_x); + scale_out_shape.set(idx_h, out_y); + + return scale_out_shape; +} + +/** Calculate the output shape of the transpose convolution layer + * + * @param[in] out_dims Output x and y shape dimensions + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * + * @return the calculated shape + */ +inline TensorShape +compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, + const ITensorInfo &input, const ITensorInfo &weights) +{ + const TensorShape input_shape{input.tensor_shape()}; + const TensorShape weights_shape{weights.tensor_shape()}; + + const DataLayout data_layout = input.data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + TensorShape out_shape{input_shape}; + out_shape.set(width_idx, out_dims.first); + out_shape.set(height_idx, out_dims.second); + out_shape.set(channel_idx, weights_shape[batch_idx]); + return out_shape; +} + +/** Calculate the 
depth to space output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block) +{ + ARM_COMPUTE_ERROR_ON(block < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(idx_width, input->dimension(idx_width) * block); + output_shape.set(idx_height, input->dimension(idx_height) * block); + output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block)); + + return output_shape; +} + +/** Calculate the space to batch output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block_shape Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON(block_shape < 2); + TensorShape output_shape{input->tensor_shape()}; + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_shape); + output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_shape); + output_shape.set(idx_depth, input->tensor_shape()[idx_depth] / (block_shape * block_shape)); + + return output_shape; +} + +/** Calculate the gather output shape of a tensor + * + * @param[in] input_shape Input tensor shape + * @param[in] indices_shape Indices tensor shape + * @param[in] actual_axis The axis to be gathered + * + * @return the calculated shape + */ +inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape, + const TensorShape &indices_shape, uint32_t actual_axis) +{ + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4); + ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions()); + + TensorShape output_shape = input_shape; + if (indices_shape.num_dimensions() == 1) + { + output_shape[actual_axis] = indices_shape[0]; + } + else if (indices_shape.num_dimensions() > 1) + { + output_shape.shift_right(indices_shape.num_dimensions() - 1); + + for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i) + { + if (o == actual_axis) + { + ++i; + for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o) + { + output_shape[o] = indices_shape[in]; + } + } + else + { + output_shape[o] = input_shape[i]; + } + } + } + return output_shape; +} + +/** Calculate the gather output shape of a tensor + * + * @param[in] input_shape Input tensor shape + * @param[in] indices_shape Indices tensor shape + * @param[in] actual_axis The axis to be gathered + * + * @return the calculated shape + */ +inline TensorShape compute_onehot_shape_ex(const TensorShape &indices_shape, 
uint32_t depth, + uint32_t actual_axis) +{ + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON(actual_axis > indices_shape.num_dimensions()); + + TensorShape output_shape; + output_shape.set(actual_axis, depth); + + unsigned int i_shift = 0; + for (unsigned int i = 0; i < indices_shape.num_dimensions(); ++i) + { + if (i == actual_axis) + { + i_shift++; + } + output_shape.set(i + i_shift, indices_shape[i]); + } + + return output_shape; +} + +} // namespace shape_calculator +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h new file mode 100644 index 000000000..484ebfd0b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ +#define __ARM_COMPUTE_CLFUNCTIONSEX_H__ + +#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h> +#include <arm_compute/runtime/CL/functions/CLCastBool.h> +#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> +#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h> +#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLNeg.h> +#include <arm_compute/runtime/CL/functions/CLOneHot.h> +#include <arm_compute/runtime/CL/functions/CLReduceOperation.h> +#include <arm_compute/runtime/CL/functions/CLSplitVEx.h> +#include <arm_compute/runtime/CL/functions/CLTopKV2.h> +#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h> + +#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h new file mode 100644 index 000000000..b1ee52bf9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ +#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ + +#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" +#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +namespace arm_compute +{ +class ITensorInfo; +class ICLTensor; + +/** Function to calculate the index of the minimum or maximum values in a + * tensor based on an axis. + * + * @note The default data type for an uninitialized output tensor is + * signed 32-bit integer (S32). It is the user's responsibility to check + * that the results do not overflow because the indices are computed + * in unsigned 32-bit (U32). + */ +class CLArgMinMaxLayerEx : public IFunction +{ +public: + /** Default Constructor. + * + * @param[in] memory_manager (Optional) Memory manager. + */ + CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[out] output Output source tensor. Data types supported: U32/S32. + * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, + * ARG_IDX_MIN + */ + void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArgMinMaxLayerEx + * + * @param[in] input Input source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[in] output Output source tensor info. Data types supported: U32/S32. + * @param[in] op Reduction operation to perform. 
Operations supported: ARG_IDX_MAX, + * ARG_IDX_MIN + * + * @return a status + */ + static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, + const ReductionOperation &op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<CLTensor> _results_vector; + CLTensor _not_reshaped_output; + std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector; + CLReshapeLayerKernel _reshape_kernel; + unsigned int _num_of_stages; + unsigned int _reduction_axis; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h new file mode 100644 index 000000000..88a9b00ec --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLBinaryLogicalOp : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input1 Source tensor1. Data types supported: U8, QASYMM8. + * @param[in] input2 Source tensor2. Data types supported: U8 QASYMM8. + * @param[out] output Output tensor. Data types supported: U8, QASYMM8. 
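+ *
+ * A hedged usage sketch (the ICLTensor objects and their allocation are assumed; the
+ * available operations are those of @ref BinaryLogicalOperation, i.e. AND and OR):
+ * @code
+ * CLBinaryLogicalOp logical_and;
+ * logical_and.configure(&input1, &input2, &output, BinaryLogicalOperation::AND);
+ * logical_and.run(); // run() is inherited from ICLSimpleFunction
+ * @endcode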
+ */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h new file mode 100644 index 000000000..d6150684a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLCastBool.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLCastBool class + */ + +#ifndef ARM_COMPUTE_CLCASTBOOL_H +#define ARM_COMPUTE_CLCASTBOOL_H + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLCastBoolKernel. + * This converts the boolean input tensor to the output tensor's type. + */ +class CLCastBool : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's input and output + * @param[in] input Input tensor. Data types supported: U8 + * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/F16/F32. + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} +#endif /* ARM_COMPUTE_CLCASTBOOL_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h new file mode 100644 index 000000000..409eaf593 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h" +#include "arm_compute/runtime/CL/functions/CLReverse.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +#include <memory> + +namespace arm_compute +{ +class ICLTensor; +/** Function to run the deconvolution layer. + * + * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input + * depending on the stride and pad info and then perform a 1x1 + * convolution pass. Input stride defines how many zeroes we should put between each element of the + * input and pad is the amount of padding. + * + * The relation between input to output is as follows: + * \f[ + * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x + * \f] + * \f[ + * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y + * \f] + * + * where: + * width_input is the size of the first input dimension. + * height_input is the size of the second input dimension. + * width_output is the size of the first output dimension. + * height_output is the size of the second output dimension. + * kernel_x and kernel_y are the convolution sizes in x and y. + * stride_x and stride_y is the input stride of the first and second dimension. + * + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. 
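To make the output-size relation above concrete: with width_input = 4, kernel_x = 3, stride_x = 2 and padding_x = 0, the formula gives width_output = (4 - 1) * 2 - 2 * 0 + 3 = 9; the height relation expands in exactly the same way.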
+ * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse. + * + * This function calls the following OpenCL kernels/functions: + * + * -# @ref CLDeconvolutionLayerUpsample + * -# @ref CLConvolutionLayer + * + * And the following CPP kernels: + * -# @ref CLReverse + * + */ +class CLDirectTransposeConvLayer : public IFunction +{ +public: + /** Constructor */ + CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete; + /** Default move constructor */ + CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete; + /** Default move assignment operator */ + CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for + * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] output Output tensor. The output has the same number of dimensions as the + * @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this + * is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * + */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and + * an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for + * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. 
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLDirectTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, except for input + * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * @param[in] output Output tensor info. The output has the same number of dimensions as the + * @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is + * decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + CLDeconvolutionLayerUpsample _scale_f; + CLConvolutionLayer _conv_f; + CLReverse _flip_weights; + + CLTensor _scaled_output; + ICLTensor *_original_weights; + CLTensor _weights_flipped; + CLTensor _flip_axis; + + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h new file mode 100644 index 000000000..fbee7e40e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class CLEmbeddingLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); +}; +} +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h new file mode 100644 index 000000000..f3266f688 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. 
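A short usage sketch for the CLEmbeddingLookup::configure() call declared above; shapes and names are illustrative, and the lookups tensor is assumed to hold S32 indices as in the corresponding kernel:

    // #include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
    CLTensor table, lookups, rows;
    table.allocator()->init(TensorInfo(TensorShape(8U, 100U), 1, DataType::F32));  // 100 embeddings of width 8
    lookups.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));      // 4 indices into the table
    rows.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));     // 4 selected embeddings

    CLEmbeddingLookup embedding;
    embedding.configure(&table, &rows, &lookups);
    // allocate the tensors, fill table/lookups, then embedding.run()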
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ +#define __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" +#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" +#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" +#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls + * the following kernels: + * + * -# @ref CLTransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CLFullyConnectedHybridLayerReshapeWeights : public ICLSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * S8. + * @param[out] output Destination tensor which stores the transposed input tensor. Data type + * supported: Same as @p input. + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLFullyConnectedHybridLayerReshapeWeights + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * S8. + * @param[in] output Destination tensor which stores the transposed input tensor. Data type + * supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +/** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following + * OpenCL kernels: + * + * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref CLFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * and transpose_weights is set to true ) (called once) + * -# @ref CLGEMMLowpMatrixMultiplyCore (if quantized symmetric) + * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CLFullyConnectedHybridLayer : public IFunction +{ +public: + /** Constructor */ + CLFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFullyConnectedHybridLayer(const CLFullyConnectedHybridLayer &) = delete; + /** Default move constructor */ + CLFullyConnectedHybridLayer(CLFullyConnectedHybridLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFullyConnectedHybridLayer &operator=(const CLFullyConnectedHybridLayer &) = delete; + /** Default move assignment operator */ + CLFullyConnectedHybridLayer &operator=(CLFullyConnectedHybridLayer &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLFullyConnectedHybridLayer + * + * @param[in] input Source tensor info. Data type supported: F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. 
Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, + bool retain_internal_weights); + + MemoryGroup _memory_group; + CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; + CLScaleFactorSymm8Kernel _scale_factor_kernel; + CLQuantizationSymmetricKernel _quant_input_kernel; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLMultiplyScaleFactorKernel _multiply_scale_kernel; + CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to + // add bias in + // CLFullyConnectedHybridLayer + CLTensor _reshape_weights_output; + CLTensor _quantized_input; + CLTensor _scale_factor; + CLTensor _gemmlowp_output; + bool _are_weights_reshaped; + bool _accumulate_biases; + bool _is_prepared; + const ICLTensor *_original_weights; +}; +} +#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h new file mode 100644 index 000000000..e65a646dc --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
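A hedged sketch of how the hybrid fully-connected function declared above is typically wired up: F32 activations with S8 symmetrically quantized weights; the shapes and the weight scale are illustrative only:

    // #include "arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h"
    CLTensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(128U, 1U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::S8,
                                         QuantizationInfo(0.05f)));                // symmetric weight scale
    biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(64U, 1U), 1, DataType::F32));

    CLFullyConnectedHybridLayer fc;
    fc.configure(&input, &weights, &biases, &output);
    // allocate and fill the tensors, then:
    fc.run();   // quantizes the input on the fly, runs GEMMLowp, rescales, and accumulates biases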
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ +#define __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls + * the following kernels: + * + * -# @ref CLTransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CLFullyConnectedLayerReshapeWeightsEx : public ICLSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor which stores the transposed input tensor. Data type + * supported: Same as @p input. + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLFullyConnectedLayerReshapeWeightsEx + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[in] output Destination tensor which stores the transposed input tensor. Data type + * supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +namespace weights_transformations +{ +/** Basic function to manage the reshape weights generated from @ref + * CLFullyConnectedLayerReshapeWeightsEx */ +class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights +{ +public: + // Inherited method override + void run() override + { + _output.allocator()->allocate(); + _func.run(); + _reshape_run = true; + } + + // Inherited method override + void release() override { _output.allocator()->free(); } + + // Inherited method override + ICLTensor *get_weights() override { return &_output; } + + // Inherited method override + uint32_t uid() override { return _uid; } + + /** Configures the @ref CLFullyConnectedLayerReshapeWeightsEx function + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + */ + void configure(const ICLTensor *input) { _func.configure(input, &_output); } + +private: + static constexpr uint32_t _uid = 0x0; + CLTensor _output{}; + CLFullyConnectedLayerReshapeWeightsEx _func{}; +}; +} // namespace weights_transformations + +/** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following + * OpenCL kernels: + * + * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref CLFullyConnectedLayerReshapeWeightsEx (if @p are_weights_reshaped is set to false and + * transpose_weights is set to true ) (called once) + * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref + * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CLFullyConnectedLayerEx : public IFunction +{ +public: + /** Constructor */ + CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFullyConnectedLayerEx(const CLFullyConnectedLayerEx &) = delete; + /** Default move constructor */ + CLFullyConnectedLayerEx(CLFullyConnectedLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFullyConnectedLayerEx &operator=(const CLFullyConnectedLayerEx &) = delete; + /** Default move assignment operator */ + CLFullyConnectedLayerEx &operator=(CLFullyConnectedLayerEx &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLFullyConnectedLayerEx + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. 
Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + + MemoryGroup _memory_group; + IWeightsManager *_weights_manager; + CLConvertFullyConnectedWeights _convert_weights; + weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed; + weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged + _reshape_weights_managed_function; + CLFlattenLayer _flatten_layer; + CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; + CLGEMM _mm_gemm; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLTensor _flatten_output; + CLTensor _converted_weights_output; + CLTensor _reshape_weights_output; + bool _are_weights_converted; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _is_quantized; + bool _is_prepared; + const ICLTensor *_original_weights; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..289ab167f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
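For CLFullyConnectedLayerEx declared above, the interesting part of configure() is usually the FullyConnectedLayerInfo argument; a sketch assuming the standard transpose_weights / are_weights_reshaped fields from the base library are used unchanged, with the surrounding tensor setup omitted:

    FullyConnectedLayerInfo fc_info;
    fc_info.transpose_weights = true;       // let the function transpose the 2D weights
    fc_info.are_weights_reshaped = false;   // weights were not pre-reshaped outside the function

    CLFullyConnectedLayerEx fc(nullptr /* memory manager */, nullptr /* weights manager */);
    fc.configure(&input, &weights, &biases, &output, fc_info);
    fc.run();   // prepare() typically converts/reshapes the weights once on the first invocation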
+ */ + +/** + * @file       CLFullyConnectedReshapingLayer.h + * @brief      This file contains CLFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class CLFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + enum class KernelType + { + GENERAL, //< General FC + PREPROCESSED_WEIGHTS //< Weights are constants so it can be preprocessed + }; + +public: + CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with biase values + * @param[in] output The destination tensor + * @param[in] needs_reshape Whether it needs to be reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @return N/A + */ + void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape, + KernelType kernel_type); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + const arm_compute::ICLTensor *_input; + const arm_compute::ICLTensor *_weights; + const arm_compute::ICLTensor *_biases; + arm_compute::ICLTensor *_output; + + // buffer for reshaping input tensor + arm_compute::CLTensor _cl_buffer; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + std::unique_ptr<arm_compute::IFunction> _cl_fc; + CLReshapeLayer _cl_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h new file mode 100644 index 000000000..b01ec4255 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
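A usage sketch for the reshaping wrapper declared above; the reshape target shape and the kernel type are illustrative:

    CLFullyConnectedReshapingLayer fc;   // default-constructed without a memory manager
    fc.configure(&input, &weights, &biases, &output,
                 /* needs_reshape */ true,
                 TensorShape(64U, 16U),  // shape the input is reshaped to before the FC (illustrative)
                 CLFullyConnectedReshapingLayer::KernelType::GENERAL);
    fc.prepare();
    fc.run();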
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLGatherEx.h + * @brief This file contains CLGatherEx class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLGATHEREX_H__ +#define __ARM_COMPUTE_CLGATHEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to to run @ref CLGatherKernel. + */ +class CLGatherEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs, output and convertion policy. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration + * of @ref CLGatherEx + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h new file mode 100644 index 000000000..6618f5aa4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
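A small sketch of CLGatherEx, declared above, validating and then gathering 3 slices along axis 1 of a 4x10 tensor (names and shapes illustrative):

    // #include "arm_compute/runtime/CL/functions/CLGatherEx.h"
    CLTensor params, indices, gathered;
    params.allocator()->init(TensorInfo(TensorShape(4U, 10U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));
    gathered.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));

    ARM_COMPUTE_ERROR_THROW_ON(
        CLGatherEx::validate(params.info(), indices.info(), gathered.info(), /* axis */ 1));
    CLGatherEx gather;
    gather.configure(&params, &indices, &gathered, /* axis */ 1);
    // allocate, fill params/indices, then gather.run()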
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class CLHashtableLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, + ICLTensor *output, ICLTensor *hits); +}; +} +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..887e7aaa5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
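A sketch of how the five tensors of CLHashtableLookup::configure(), declared above, relate to each other; the lookups tensor is assumed to be S32 like the keys, and the shapes are illustrative:

    CLTensor lookups, keys, values, found_rows, hits;
    lookups.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::S32));       // keys to search for
    keys.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::S32));         // keys of the table
    values.allocator()->init(TensorInfo(TensorShape(8U, 10U), 1, DataType::F32));   // one row of values per key
    found_rows.allocator()->init(TensorInfo(TensorShape(8U, 3U), 1, DataType::F32));
    hits.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U8));           // 1 where a key matched

    CLHashtableLookup lookup;
    lookup.configure(&lookups, &keys, &values, &found_rows, &hits);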
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to perform a Instance normalization. + * + * This function runs the following kernels: + * -# @ref CLInstanceNormalizationLayerKernelEx + */ +class CLInstanceNormalizationLayerEx : public ICLSimpleFunction +{ +public: + /** Default constructor */ + CLInstanceNormalizationLayerEx(); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr, + ICLTensor *beta = nullptr, float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLInstanceNormalizationLayerEx. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. 
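A minimal sketch for the CLInstanceNormalizationLayerEx::configure() overload declared above, relying on the defaulted gamma/beta arguments; the epsilon shown is an illustrative override:

    CLTensor src, dst;   // F16/F32, NHWC or NCHW, initialised and allocated elsewhere
    CLInstanceNormalizationLayerEx instance_norm;
    instance_norm.configure(&src, &dst, /* gamma */ nullptr, /* beta */ nullptr, /* epsilon */ 1e-5f);
    instance_norm.run();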
Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h new file mode 100644 index 000000000..8ec9aa307 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLNEG_H__ +#define __ARM_COMPUTE_CLNEG_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLNeg : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + * + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEG_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h new file mode 100644 index 000000000..2bbfca821 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLONEHOT_H__ +#define __ARM_COMPUTE_CLONEHOT_H__ +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" +#include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/runtime/IFunction.h" +namespace arm_compute +{ +class ICLTensor; +/** Basic function to run @ref CLOneHotKernel */ +class CLOneHot : public IFunction +{ +public: + /** Constructor */ + CLOneHot(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHot(const CLOneHot &) = delete; + /** Default move constructor */ + CLOneHot(CLOneHot &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHot &operator=(const CLOneHot &) = delete; + /** Default move assignment operator */ + CLOneHot &operator=(CLOneHot &&) = default; + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value, + ICLTensor *output, int depth, int axis = -1); + /** Initialise the kernel's inputs and outputs with off_value being constant + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] off_value The PixelValue for off value. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + PixelValue off_value, int depth, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis = -1); + + // Inherited methods overridden: + void run() override; + +private: + CLMemsetKernel _memset_kernel; /**< Memset kernel */ + CLOneHotKernel _onehot_kernel; /**< OneHot kernel */ + bool _has_to_memset; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLONEHOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h new file mode 100644 index 000000000..bb852e404 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
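A sketch for the first CLOneHot::configure() overload declared above, turning 4 S32 indices into a depth-10 one-hot F32 tensor; the shapes and output layout are illustrative:

    CLTensor indices, on_value, off_value, one_hot;
    indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
    on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));    // e.g. 1.0f
    off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));   // e.g. 0.0f
    one_hot.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

    CLOneHot onehot;
    onehot.configure(&indices, &on_value, &off_value, &one_hot, /* depth */ 10, /* axis */ -1);
    // allocate, fill indices/on_value/off_value, then onehot.run()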
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLReduceOperation.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::CLReduceOperation class
+ */
+
+#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__
+#define __ARM_COMPUTE_CLREDUCEOPERATION_H__
+
+#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform ReduceOperation
+ */
+class CLReduceOperation : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new ReduceOperation object
+ */
+ CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager);
+
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor. Data types supported: U8/S32/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis,
+ bool keep_dims, ReductionOperation op);
+
+ /**
+ * @brief Static function to check if given info will lead to a valid configuration of @ref
+ * CLReduceOperation.
+ * @param[in] input Source tensor info. Data types supported: U8/S32/F32
+ * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
+ * @param[in] keep_dims If true, retains reduced dimensions with length 1.
+ * @param[in] op Reduce operation to perform.
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const std::set<uint32_t> &axis, bool keep_dims,
+ const ReductionOperation &op);
+
+ /**
+ * @brief Run the OpenCL kernel for this operation
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ MemoryGroup _memory_group;
+ ICLTensor *_input;
+ ICLTensor *_output;
+ std::set<uint32_t> _axis;
+ bool _keep_dims;
+
+ std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
+ std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
+ CLReshapeLayer _reshape;
+};
+}
+#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
new file mode 100644
index 000000000..bb741d98d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSPLITVEX__
+#define __ARM_COMPUTE_CLSPLITVEX__
+
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/functions/CLSlice.h"
+#include "arm_compute/core/Types.h"
+#include <vector>
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSplitVKernel */
+class CLSplitVEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLSplitVEx();
+ /** Configure the split CL kernel
+ *
+ * @param[in] input The input tensor to split. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] size_splits A 1-D tensor containing the number of tensor values per split
+ * @param[out] outputs A vector containing the output tensors.
Data types supported: Same as @p + * input + * The output tensors should match the input tensor dimensions for all + * shape dimensions apart + * from the split dimension. + * @param[in] split_dim Integer value representing the input tensor dimension along which to + * split + * @param[in] num_splits Number of splits + */ + void configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim, + const std::vector<ICLTensor *> &outputs, unsigned int num_splits); + + void run() override; + +private: + const ICLTensor *_input; + const ICLTensor *_size_splits; + std::vector<ICLTensor *> _outputs; + unsigned int _num_splits; + std::vector<CLSlice> _slice_functions; +}; +} +#endif /* __ARM_COMPUTE_CLSPLITVEX__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h new file mode 100644 index 000000000..e301a5152 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLTopKV2.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLTopKV2 class + */ +#ifndef __ARM_COMPUTE_CLTOPK_V2_H__ +#define __ARM_COMPUTE_CLTOPK_V2_H__ + +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute TopKV2 operation. 
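+ *
+ * A minimal usage sketch (illustrative only; `in`, `top_vals` and `top_idx` are assumed to be
+ * pre-allocated CLTensor objects with compatible data types):
+ * @code
+ * CLTopKV2 topk;
+ * topk.configure(&in, 5, &top_vals, &top_idx); // keep the 5 largest values
+ * topk.run(); // sorting backend selected via the ACL_TOPKV2 environment variable
+ * @endcode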
+ */
+class CLTopKV2 : public IFunction
+{
+public:
+ /**
+ * @brief Construct a new CLTopKV2 object
+ */
+ CLTopKV2();
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Prevent instances of this class from being copied (As this class contains pointers)
+ */
+ CLTopKV2 &operator=(const CLTopKV2 &) = delete;
+
+ /**
+ * @brief Construct a new CLTopKV2 object by using the move constructor
+ * @param[in] CLTopKV2 object to move
+ */
+ CLTopKV2(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Assign a CLTopKV2 object.
+ * @param[in] CLTopKV2 object to assign. This object will be moved.
+ */
+ CLTopKV2 &operator=(CLTopKV2 &&) = default;
+
+ /**
+ * @brief Initialise the kernel's inputs and outputs.
+ * @param[in] input Input image. Data types supported: U8/S16/F32.
+ * @param[in] k The value of `k`.
+ * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if
+ * input type is F32.
+ * @param[out] indices Indices related to top k values. Data types supported: S32 if input type
+ * is U8/S16, F32 if input type is F32.
+ * @param[in] total_bits (Optional) Total number of key bits used by the radix sort path.
+ * Defaults to 32.
+ * @param[in] bits (Optional) Number of bits processed per radix sort pass. Defaults to 4.
+ * @return N/A
+ */
+ void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices,
+ int total_bits = 32, int bits = 4);
+
+ /**
+ * @brief Run the kernels contained in the function
+ * Depending on the value of the following environment variables it works differently:
+ * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE",
+ * quick sort on GPU is used.
+ * - If the value of environment variable "ACL_TOPKV2" == "GPU",
+ * radix sort on GPU is used.
+ * - For any other value, TopKV2 runs on the CPU.
+ * @return N/A
+ */
+ void run() override;
+
+private:
+ void run_on_cpu();
+ void run_on_gpu();
+ void run_on_gpu_single_quicksort();
+
+ uint32_t _k;
+ uint32_t _total_bits;
+ uint32_t _bits;
+ uint32_t _radix;
+ uint32_t _hist_buf_size;
+ uint32_t _glob_sum_buf_size;
+ uint32_t _n;
+
+ ICLTensor *_input;
+ ICLTensor *_values;
+ ICLTensor *_indices;
+
+ cl::Buffer _qs_idx_buf;
+ cl::Buffer _qs_temp_buf;
+ cl::Buffer _hist_buf;
+ cl::Buffer _glob_sum_buf;
+ cl::Buffer _temp_buf;
+ cl::Buffer _first_negative_idx_buf;
+ cl::Buffer _in_key_buf;
+ cl::Buffer _out_key_buf;
+ cl::Buffer _in_ind_buf;
+ cl::Buffer _out_ind_buf;
+
+ cl::Buffer *_p_in_key_buf;
+ cl::Buffer *_p_out_key_buf;
+ cl::Buffer *_p_in_ind_buf;
+ cl::Buffer *_p_out_ind_buf;
+// Disable GPU implementation
+// TODO Enable GPU implementation with verification, or remove code
+// Invalid result on GPU
+#if 0
+ CLTopKV2Single _qs_kernel;
+ CLTopKV2Init _init_kernel;
+ CLRadixSortHistogram _hist_kernel;
+ CLRadixSortScanHistogram _scan_hist_kernel;
+ CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel;
+ CLRadixSortPasteHistogram _paste_hist_kernel;
+ CLRadixSortReorder _reorder_kernel;
+ CLTopKV2FindFirstNegative _find_first_negative_kernel;
+ CLTopKV2ReorderNegatives _reorder_negatives_kernel;
+ CLTopKV2Store _store_kernel;
+#endif
+};
+}
+#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
new file mode 100644
index 000000000..5fb102e47
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd.
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include <memory> + +namespace arm_compute +{ +/** Basic function to compute the deconvolution layer. This function calls the following OpenCL + * kernels/functions: + * + * -# @ref CLGEMMDeconvolutionLayer + * -# @ref CLDirectTransposeConvLayer + */ +class CLTransposeConvLayer : public IFunction +{ +public: + /** Default constructor */ + CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same + * as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this + * is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * + */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and + * an optional 4th dimension for batch of inputs. Data types supported: + * QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as + * @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + + static DeconvolutionMethod + get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info); + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + std::unique_ptr<IFunction> _function; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h new file mode 100644 index 000000000..efc296d6c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ +#define __ARM_COMPUTE_NEFUNCTIONSEX_H__ + +#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> +#include <arm_compute/runtime/NEON/functions/NECastBool.h> +#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/NEON/functions/NEGatherEx.h> +#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h> +#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEOneHot.h> +#include <arm_compute/runtime/NEON/functions/NEReduceSum.h> +#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h> +#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h> + +#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h new file mode 100644 index 000000000..026d30098 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +class NEBinaryLogicalOperation : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op); +}; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. 
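+ *
+ * A minimal usage sketch (illustrative only; `a`, `b` and `out` are assumed to be allocated
+ * QASYMM8/U8 Tensor objects of matching shape):
+ * @code
+ * NELogicalAnd land; // alias of NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>
+ * land.configure(&a, &b, &out);
+ * land.run(); // computes element-wise a AND b into out
+ * @endcode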
+ */
+template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8
+ * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[out] output Output tensor. Data types supported: Same as @p input1.
+ */
+ void configure(ITensor *input1, ITensor *input2, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEBinaryLogicalOperationKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
+ const ITensorInfo *output);
+};
+
+/** Basic function to run the logical AND operation. */
+using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>;
+/** Basic function to run the logical OR operation. */
+using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>;
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
new file mode 100644
index 000000000..c8b08af8d
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NECASTBOOL_H__ +#define __ARM_COMPUTE_NECASTBOOL_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to run @ref NECastBoolKernel. + */ +class NECastBool : public INESimpleFunction +{ +public: + /** Initialize the function's source, destination + * + * Valid conversions Input -> Output : + * + * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16 + * + * @param[in] input The input tensor to convert. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NECastBool + * + * @param[in] input Source tensor info. Data types supported: U8. + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTBOOL_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h new file mode 100644 index 000000000..63f7714aa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+/**
+ * @file NEEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains arm_compute::NEEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class NEEmbeddingLookup : public INESimpleFunctionNoBorder
+{
+public:
+ /**
+ * @brief Set the input and output tensors.
+ * @param[in] input Source tensor.
+ * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p
+ * input.
+ * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of
+ * input. Data types supported: S32.
+ * @return N/A
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *lookups);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEEmbeddingLookup
+ *
+ * @param[in] input Source tensor info. Data types supported:
+ * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ * @param[in] lookups Lookups tensor info. Data types supported: S32.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const ITensorInfo *lookups);
+};
+}
+#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
new file mode 100644
index 000000000..56548a479
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls + * the following kernels: + * + * -# @ref NETransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayerReshapeWeights + * + * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * and transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
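+ *
+ * A minimal usage sketch (illustrative only; `src` (F16/F32 input), `w` (S8 weights), `b` (bias)
+ * and `dst` are assumed to be allocated Tensor objects):
+ * @code
+ * NEFullyConnectedHybridLayer fc;
+ * fc.configure(&src, &w, &b, &dst); // default FullyConnectedLayerInfo
+ * fc.prepare(); // one-time weight reshaping setup
+ * fc.run();
+ * @endcode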
+ */ +class NEFullyConnectedHybridLayer : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete; + /** Default move constructor */ + NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete; + /** Default move assignment operator */ + NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayer + * + * @param[in] input Source tensor info. Data type supported: F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NEQuantizationSymmetricKernel _quant_input_kernel; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEMultiplyScaleFactorKernel _multiply_scale_kernel; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _reshape_weights_output; + Tensor _quantized_input; + Tensor _scale_factor; + Tensor _gemmlowp_output; + const ITensor *_original_weights; + bool _are_weights_reshaped; + bool _accumulate_biases; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h new file mode 100644 index 000000000..8f98f220a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and + * transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + * @note The difference from NEFullyConnectedLayer is that this class supports weights as input + * with performance loss. + */ +class NEFullyConnectedLayerEx : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; + /** Default move constructor */ + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; + /** Default move assignment operator */ + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedLayerEx + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFlattenLayerKernel _flatten_kernel; + NEConvertFullyConnectedWeights _convert_weights; + NEFullyConnectedLayerReshapeWeights _reshape_weights_function; + NEGEMM _mm_gemm; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _flatten_output; + Tensor _gemmlowp_output; + Tensor _converted_weights_output; + Tensor _reshape_weights_output; + const ITensor *_original_weights; + bool _are_weights_converted; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _accumulate_biases; + bool _is_quantized; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..18cb61bf9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file NEFullyConnectedReshapingLayer.h
+ * @brief This file contains NEFullyConnectedReshapingLayer class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
+
+#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
+#include <arm_compute/runtime/IMemoryManager.h>
+#include <arm_compute/runtime/Tensor.h>
+
+namespace arm_compute
+{
+/**
+ * @brief Class to run FullyConnected Layer after reshaping input tensor
+ */
+class NEFullyConnectedReshapingLayer : public arm_compute::IFunction
+{
+public:
+ enum class KernelType
+ {
+ GENERAL, ///< General FC
+ PREPROCESSED_WEIGHTS ///< Weights are constants so they can be preprocessed
+ };
+
+public:
+ NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+ : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr),
+ _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false)
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] weights The tensor that is filled with weight values
+ * @param[in] biases The tensor that is filled with bias values
+ * @param[in] output The destination tensor
+ * @param[in] needs_reshape Whether it needs to be reshaped or not
+ * @param[in] reshape The target shape of the reshape. Only valid when needs_reshape is true.
+ * @param[in] kernel_type The kernel type for actual FullyConnected layer
+ * @return N/A
+ */
+ void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
+ const arm_compute::ITensor *biases, arm_compute::ITensor *output,
+ bool needs_reshape, const arm_compute::TensorShape &reshape,
+ KernelType kernel_type);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+ /**
+ * @brief Prepare the operation
+ * @return N/A
+ */
+ void prepare(void) override;
+
+private:
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ const arm_compute::ITensor *_input;
+ const arm_compute::ITensor *_weights;
+ const arm_compute::ITensor *_biases;
+ arm_compute::ITensor *_output;
+
+ // buffer for reshaping input tensor
+ arm_compute::Tensor _neon_buffer;
+
+private:
+ std::unique_ptr<arm_compute::IFunction> _neon_fc;
+ NEReshapeLayer _neon_reshape;
+ bool _needs_reshape;
+};
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
new file mode 100644
index 000000000..155a1b837
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHEREX_H__
+#define __ARM_COMPUTE_NEGATHEREX_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEGatherKernelEx */
+class NEGatherEx : public INESimpleFunctionNoBorder
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[out] output Destination tensor. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ */
+ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGatherKernelEx
+ *
+ * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported:
+ * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] axis (Optional) The axis in @p input to gather @p indices from.
Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h new file mode 100644 index 000000000..521a05ad9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file NEHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class NEHashtableLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). 
Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref NECopy + * + * @param[in] lookups Lookups 1D tensor info. + * Data types supported: S32 + * @param[in] keys Keys 1D tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] hits Hits 1D tensor info. A boolean tensor that indicates whether the lookup + * hits (True) or not (False). Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); +}; +} +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..18e813923 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform a Instance normalization. + * + * This function runs the following kernels: + * -# @ref NEInstanceNormalizationLayerKernelEx + */ +class NEInstanceNormalizationLayerEx : public IFunction +{ +public: + /** Constructor */ + NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEInstanceNormalizationLayerKernelEx _normalization_kernel; + bool _is_nchw; + NEPermute _permute_input; + NEPermute _permute_output; + Tensor _permuted_input; + Tensor _permuted_output; +}; +} +#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h new file mode 100644 index 000000000..b2ea6270f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEONEHOT_H__ +#define __ARM_COMPUTE_NEONEHOT_H__ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +namespace arm_compute +{ +// Forward declarations +class ITensor; +/** Basic function to run @ref NEOneHotKernel */ +class NEOneHot : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up + * to 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + */ + void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEOneHotKernel + * + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: + * up to 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. 
Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis = -1); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEONEHOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h new file mode 100644 index 000000000..91eec815c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ +#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceOperation : public IFunction +{ +public: + /** Constructor */ + NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output, + ReductionOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceOperation + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReductionOperation op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperation> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h new file mode 100644 index 000000000..48b416923 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__ +#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceSum : public IFunction +{ +public: + /** Constructor */ + NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If positive, retains reduced dimensions with length 1. + * @param[in] output Destination tensor. 
Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperation> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h new file mode 100644 index 000000000..24ff5dac9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReverse.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +/** Function to run the deconvolution layer. + * + * Deconvolution Layer is the backward pass of Convolution Layer. 
First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input, pad is the amount of padding and finally a is a user-specified
+ * value where a < stride - 1 that increases the padding top and right of the input image.
+ *
+ * The relation between input and output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where
+ * width is the size of the first input dimension.
+ * height is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimension.
+ *
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it is necessary to use the weights in
+ * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
+ *
+ * This function calls the following NEON kernels/functions:
+ *
+ * -# @ref CPPUpsampleEx
+ * -# @ref NEConvolutionLayer
+ * -# @ref NEPermute
+ * -# @ref NEReverse
+ *
+ */
+class NETransposeConvLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer(const NETransposeConvLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ /** Default destructor */
+ virtual ~NETransposeConvLayer() = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension.
+ * Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
+ * for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
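+ * @note A worked example of the shape formulas above (illustrative values, not taken from the
+ * upstream documentation): a 4x4 input with 3x3 weights, stride_x = stride_y = 2 and zero
+ * padding gives width_output = (4 - 1) * 2 - 0 + 3 = 9, i.e. a 9x9 output plane.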
+ *
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NETransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p
+ * input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, const ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ NEConvolutionLayer _conv_f;
+ CPPUpsample _upsample_f;
+ NEReverse _flip_weights;
+ Tensor _scaled_output;
+ Tensor _weights_flipped;
+ Tensor _flip_axis;
+ const ITensor *_original_weights;
+ ITensor *_input;
+ PadStrideInfo _info;
+ bool _is_prepared;
+};
+} // arm_compute
+#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py
new file mode 100755
index 000000000..f37c2a957
--- /dev/null
+++ b/compute/ARMComputeEx/resolve_includes.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import collections +import os.path +import re +import subprocess +import glob + + +def resolve_includes(target, source): + # File collection + FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents') + + # Include pattern + pattern = re.compile("#include \"(.*)\"") + + # Get file contents + files = [] + for i in range(len(source)): + src = source[i] + dst = target[i] + f = open(src) + cts = f.read() + f.close() + contents = cts.splitlines() + entry = FileEntry(target_name=dst, file_contents=contents) + files.append((os.path.basename(src), entry)) + + # Create dictionary of tupled list + files_dict = dict(files) + + # Check for includes (can only be files in the same folder) + final_files = [] + for file in files: + done = False + tmp_file = file[1].file_contents + print(file[1].target_name) + while not done: + file_count = 0 + updated_file = [] + for line in tmp_file: + found = pattern.search(line) + if found: + include_file = found.group(1) + data = files_dict[include_file].file_contents + updated_file.extend(data) + else: + updated_file.append(line) + file_count += 1 + + # Check if all include are replaced. + if file_count == len(tmp_file): + done = True + + # Update temp file + tmp_file = updated_file + + # Append and prepend string literal identifiers and add expanded file to final list + tmp_file.insert(0, "R\"(\n") + tmp_file.append("\n)\"") + entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file) + final_files.append((file[0], entry)) + + # Write output files + for file in final_files: + with open(file[1].target_name, 'w+') as out_file: + out_file.write("\n".join(file[1].file_contents)) + + +# Generate embed files +cl_files = glob.glob('src/core/CL/cl_kernels/*.cl') +cl_files += glob.glob('src/core/CL/cl_kernels/*.h') + +# DEBUG: print cl files +print("cl_files:") +print(cl_files) + +embed_files = [f + "embed" for f in cl_files] +print("embed_files:") +print(embed_files) + +resolve_includes(embed_files, cl_files) diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp new file mode 100644 index 000000000..81d0cb70f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <utility> +#include <vector> + +using namespace arm_compute; + +const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { + // ARMComputeEx kernels + {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast_bool", "cast.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"one_hot", "one_hot.cl"}, + {"one_hot_only_on_value", "one_hot.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, +}; + +const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { +#ifdef EMBEDDED_KERNELS + { + "arg_min_max_ex.cl", +#include "./cl_kernels/arg_min_max_ex.clembed" + }, + { + "cast.cl", +#include "./cl_kernels/cast.clembed" + }, + { + "embedding_lookup.cl", +#include "./cl_kernels/embedding_lookup.clembed" + }, + { + "gather_ex.cl", +#include "./cl_kernels/gather_ex.clembed" + }, + { + "gemmlowp_ex.cl", +#include "./cl_kernels/gemmlowp_ex.clembed" + }, + { + "hashtable_lookup.cl", +#include "./cl_kernels/hashtable_lookup.clembed" + }, + { + "helpers.h", +#include 
"./cl_kernels/helpers.hembed" + }, + { + "helpers_asymm.h", +#include "./cl_kernels/helpers_asymm.hembed" + }, + { + "instance_normalization_ex.cl", +#include "./cl_kernels/instance_normalization_ex.clembed" + }, + { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "multiply_scale_factor.cl", +#include "./cl_kernels/multiply_scale_factor.clembed" + }, + { + "neg_tensor.cl", +#include "./cl_kernels/neg_tensor.clembed" + }, + { + "one_hot.cl", +#include "./cl_kernels/one_hot.clembed" + }, + { + "quantization_symm8.cl", +#include "./cl_kernels/quantization_symm8.clembed" + }, + { + "reduce_operation.cl", +#include "./cl_kernels/reduce_operation.clembed" + }, + { + "scale_factor.cl", +#include "./cl_kernels/scale_factor.clembed" + }, + { + "topkv2.cl", +#include "./cl_kernels/topkv2.clembed" + }, + { + "topkv2_radixsort.cl", +#include "./cl_kernels/topkv2_radixsort.clembed" + }, + { + "topkv2_quicksort.cl", +#include "./cl_kernels/topkv2_quicksort.clembed" + }, + +#endif /* EMBEDDED_KERNELS */ +}; + +CLKernelLibraryEx::CLKernelLibraryEx() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() +{ + opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the + // CLKernelLibraryEx is built +} + +CLKernelLibraryEx &CLKernelLibraryEx::get() +{ + static CLKernelLibraryEx _kernel_library; + return _kernel_library; +} + +Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, + const StringSet &build_options_set) const +{ + // Find which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if (_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + std::string concat_str; + + if (fp16_supported()) + { + concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; + } + + if (get_cl_version(_device) == CLVersion::CL20) + { + concat_str += " -cl-std=CL2.0 "; + } + else if (arm_non_uniform_workgroup_supported(_device)) + { + concat_str += " -cl-arm-non-uniform-work-group-size "; + } + else + { + ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); + } + + // Check if the program has been built before with same build options. 
+ const std::string program_name = kernel_program_it->second; + const std::string build_options = stringify_set(build_options_set) + concat_str; + + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + cl::Program cl_program; + + if (_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + cl_program = built_program_it->second; + } + else + { + // Get program + Program program = load_program(program_name); + + // Build program + cl_program = program.build(build_options); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, cl_program); + } + + // Create and return kernel + return Kernel(kernel_name, cl_program); +} + +void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, + cl::Program program) +{ + _built_programs_map.emplace(built_program_name, program); +} + +bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } + +bool CLKernelLibraryEx::int64_base_atomics_supported() const +{ + return device_supports_extension(_device, "cl_khr_int64_base_atomics"); +} + +const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if (program_it != _programs_map.end()) + { + return program_it->second; + } + + Program program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if (_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); + } + + program = Program(_context, program_name, program_source_it->second); +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _kernel_path + program_name; + std::string binary_name = source_name + "bin"; + + if (std::ifstream(binary_name).is_open()) + { + const std::string program_binary = read_file(binary_name, true); + program = Program(_context, _device, program_name, + std::vector<unsigned char>(program_binary.begin(), program_binary.end())); + } + else if (std::ifstream(source_name).is_open()) + { + program = Program(_context, program_name, read_file(source_name, false)); + } + else + { + ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str()); + } +#endif /* EMBEDDED_KERNELS */ + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const +{ + std::string concat_set; + +#ifndef EMBEDDED_KERNELS + concat_set += "-I" + _kernel_path + " "; +#endif /* EMBEDDED_KERNELS */ + + // Concatenate set + for (const auto &el : s) + { + concat_set += " " + el; + } + + return concat_set; +} + +std::string CLKernelLibraryEx::get_program_source(const std::string &program_name) +{ + const auto program_source_it = _program_source_map.find(program_name); + + if (program_source_it == _program_source_map.end()) + { + ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); + } + + return program_source_it->second; +} + +size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const +{ + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG( + err != 0, + 
"clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); + + return result; +} + +cl::NDRange CLKernelLibraryEx::default_ndrange() const +{ + // GPUTarget _target = get_target_from_device(_device); + cl::Device device = cl::Device::getDefault(); + GPUTarget _target = get_target_from_device(device); + cl::NDRange default_range; + + switch (_target) + { + case GPUTarget::MIDGARD: + case GPUTarget::T600: + case GPUTarget::T700: + case GPUTarget::T800: + default_range = cl::NDRange(128u, 1); + break; + default: + default_range = cl::NullRange; + } + + return default_range; +} + +std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl new file mode 100644 index 000000000..0a014d15c --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(FLOAT_DATA_TYPE) +#define ISGREATER(x, y) isgreater(x, y) +#define ISLESS(x, y) isless(x, y) +#else // !FLOAT_DATA_TYPE +#if defined(WIDTH) +#define ISGREATER(x, y) (x > y) ? 1 : 0 +#define ISLESS(x, y) (x < y) ? 
1 : 0 +#else // !defined(WIDTH) +#define ISGREATER(x, y) \ + select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y) +#define ISLESS(x, y) \ + select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y) +#endif // defined(WIDTH) +#endif // defined(FLOAT_DATA_TYPE) + +#if defined(ARG_MAX) +#define CONDITION_TO_USE(x, y) ISGREATER(x, y) +#elif defined(ARG_MIN) +#define CONDITION_TO_USE(x, y) ISLESS(x, y) +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) +#error "Unsupported reduction operation!" +#endif // defined(ARG_MAX) + +#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) +#if defined(WIDTH) +#if defined(ARG_MIN) +#if defined(PREV_OUTPUT) +/** Find index minimum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. + */ +inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, + __global const DATA_TYPE_OUTPUT *prev_res, + const int x_idx) +{ + int end_elem = (x_idx + 1) * 16; + if (end_elem > WIDTH) + { + end_elem = WIDTH - x_idx * 16; + } + DATA_TYPE_OUTPUT res = prev_res[0]; + for (int x_v = 1; x_v < end_elem; ++x_v) + { + res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res)); + } + return res; +} +#else // !defined(PREV_OUTPUT) +/** Find index minimum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. + */ +inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx) +{ +#if WIDTH < 16 + DATA_TYPE_OUTPUT res = 0; + for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) + { + res = select(res, x_v, *(input + x_v) < *(input + res)); + } + return res; +#else // WIDTH >= 16 + int x_elem = x_idx * 16; + const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); + x_elem -= x_goback; + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input - x_goback); + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) + idx_sel = (in.s01234567 <= in.s89abcdef); + in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); + res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); + + idx_sel.s0123 = (in.s0123 < in.s4567) || + (in.s0123 == in.s4567 && + CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); + res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); + + idx_sel.s01 = + (in.s01 < in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + in.s01 = select(in.s23, in.s01, idx_sel.s01); + res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); + + idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); + res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); + + return res.s0 + x_elem; +#endif // WIDTH < 16 +} +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MIN) +#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) +/** Find index maximum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. 
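+ *
+ * @note @p prev_res points at the candidate indices produced by the previous reduction stage,
+ * and @p x_idx selects the 16-element block of @p input that this work item refines.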
+ */ +inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, + __global const DATA_TYPE_OUTPUT *prev_res, + const int x_idx) +{ + int end_elem = (x_idx + 1) * 16; + if (end_elem > WIDTH) + { + end_elem = WIDTH - x_idx * 16; + } + DATA_TYPE_OUTPUT res = prev_res[0]; + unsigned int res_int = res; + DATA_TYPE_OUTPUT condition_check2; + for (int x_v = 1; x_v < end_elem; ++x_v) + { + int i1 = prev_res[x_v]; + condition_check2 = *(input + i1) > *(input + res_int); + res = select(res, prev_res[x_v], condition_check2); + } + return res; +} +#else // !defined(PREV_OUTPUT) +/** Find index maximum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. + */ +inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx) +{ +#if WIDTH < 16 + DATA_TYPE_OUTPUT res = 0; + unsigned int i1; + unsigned int i2; + DATA_TYPE_OUTPUT condition_check; + for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) + { + i1 = x_v; + i2 = res; + condition_check = *(input + i1) > *(input + i2); + res = select(res, x_v, condition_check); + } + return res; +#else // WIDTH >= 16 + int x_elem = x_idx * 16; + const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); + x_elem -= x_goback; + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input - x_goback); + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) + idx_sel = (in.s01234567 >= in.s89abcdef); + in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); + res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); + + idx_sel.s0123 = (in.s0123 > in.s4567) || + (in.s0123 == in.s4567 && + CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); + res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); + + idx_sel.s01 = + (in.s01 > in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + in.s01 = select(in.s23, in.s01, idx_sel.s01); + res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); + + idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); + res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); + + return res.s0 + x_elem; +#endif // WIDTH < 16 +} +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) + +/** This kernel performs parallel reduction given an operation on x-axis. + * + * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed + * using -DPREV_OUTPUT + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. + * -DDATA_TYPE_OUTPUT=uint + * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the + * ArgMax + * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the + * ArgMin + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data + * types: S32/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[in] prev_res_ptr (Optional) Pointer to previous results + * tensor. Supported data types: U32/S32 + * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X + * dimension (in bytes) + * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y + * dimension (in bytes) + * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element + * in the previous results tensor + * @param[in] partial_res_ptr The local buffer to hold partial result + * values. Supported data types: U32/S32 + * @param[in] partial_res_stride_x Stride of the output tensor in X dimension + * (in bytes) + * @param[in] partial_res_step_x partial_res_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension + * (in bytes) + * @param[in] partial_res_step_y partial_res_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[in] local_results Local buffer for storing the partial result + */ +__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), +#if defined(PREV_OUTPUT) + IMAGE_DECLARATION(prev_res), +#endif // defined(PREV_OUTPUT) + IMAGE_DECLARATION(partial_res), + __local DATA_TYPE_OUTPUT *local_results) +{ +#if defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); + Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res); +#else // !defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT(src); +#endif // defined(PREV_OUTPUT) + Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res); + + unsigned int lsize = get_local_size(0); + unsigned int lid = get_local_id(0); + + const uint x_idx = get_global_id(0); + const uint y_idx = get_global_id(1); + const __global DATA_TYPE *src_in_row = + (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + + y_idx * src_step_y); + + for (unsigned int y = 0; y < get_local_size(1); ++y) + { +#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) + local_results[lid] = arg_idx_max_prev_out( + src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#else // defined(ARG_MIN) +#if defined(PREV_OUTPUT) + local_results[lid] = arg_idx_min_prev_out( + src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) || defined(ARG_MIN) + + 
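+ // Every work item has just written the argmin/argmax index of its own block into
+ // local_results[lid]; the barrier below makes those writes visible before the tree
+ // reduction starts reading neighbouring entries.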
barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Looking for the next highest power of 2 (maximum value of lsize is 8)
+ unsigned int middle = lsize - 1;
+ middle |= middle >> 1;
+ middle |= middle >> 2;
+ middle += 1;
+ // Perform parallel reduction
+ DATA_TYPE_OUTPUT condition_check3;
+ for (unsigned int i = middle; i > 0; i >>= 1)
+ {
+ if (lid < i && lid + i < lsize)
+ {
+ DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
+ DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
+#if defined(ARG_MAX)
+ condition_check3 =
+ ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
+ local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
+#else // defined(ARG_MIN)
+ local_results[lid] = select(
+ local_results[lid], local_results[lid + i],
+ ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
+#endif // defined(ARG_MAX) || defined(ARG_MIN)
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (lid == 0)
+ {
+ ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
+ }
+ }
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs a reduction on the y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
+ * -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
+ * -DDATA_TYPE_OUTPUT=uint
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold the summed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (unsigned int y = 1; y < HEIGHT; ++y)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in =
+ CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, y, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs a reduction on the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold the summed values. Supported
+ * data types: U32/S32
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in
+ * bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in
+ * bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in
+ * bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ */
+__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ indx = 0;
+ for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+
+ VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+ cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+ indx = select(indx, z, cond_conv);
+ res = select(res, in, CONDITION_TO_USE(in, res));
+ }
+
+ // Store result
+ vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs a reduction on the w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the select results must be passed at compile time using
+ * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data
+ * types: S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in
+ * bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] output_ptr The local buffer to hold the summed values.
Supported + * data types: U32/S32 + * @param[in] output_stride_x Stride of the output tensor in X dimension (in + * bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in + * bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the output tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source + * tensor + */ +__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH); + + VEC_DATA_TYPE(DATA_TYPE, 16) + res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), + VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + indx = 0; + for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), + VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); + indx = select(indx, w, cond_conv); + res = select(res, in, CONDITION_TO_USE(in, res)); + } + + // Store result + vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr); +} +#endif /* defined(BATCH) && defined(DEPTH) */ +#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */ diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl new file mode 100644 index 000000000..e249663bc --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(OP_CODE) && defined(DATA_TYPE) +/** returns truth value of the two input tensors for BINARY LOGICAL OP. + * where BINARY LOGICAL OP can be AND, OR. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. + * e.g. -DVEC_SIZE=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input1_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input1_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input1_step_x input1_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input1_step_y input1_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input1_step_z input1_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input2_ptr Pointer to the source tensor. + * Supported data types: QASYMM8 + * @param[in] input2_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] input2_step_x input2_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input2_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] input2_step_y input2_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input2_stride_z Stride of the source tensor in Z dimension + * (in bytes) + * @param[in] input2_step_z input2_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] output_ptr Pointer to the destination tensor. 
+ * Supported data types: QASYMM8
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ */
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+ Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if OP_CODE == 1 // LOGICAL AND
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+#elif OP_CODE == 2 // LOGICAL OR
+ VSTORE(VEC_SIZE)
+ (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+ VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+ 0, (__global DATA_TYPE *)output.ptr);
+
+#else // OP NOT SUPPORTED
+ return;
+
+#endif
+}
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..3b0a175a4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs an up-scaling depth conversion for boolean-type input.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and
+ * -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note The integer shift amount value needs to be passed at compile time using -DSHIFT:
+ * e.g. -DSHIFT=7
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types:
+ * U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data
+ * types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ */
+__kernel void cast_bool(TENSOR3D_DECLARATION(in), TENSOR3D_DECLARATION(out))
+{
+ // Get pixels pointer
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+ in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
+
+ VSTORE(VEC_SIZE)
+ (CONVERT(in_data & 1, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+ (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..92e5dfbee
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform embedding_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector + */ + +__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + // lookup ids for based on the tensor dimensions + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl new file mode 100644 index 000000000..2236021f1 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) + +/** Performs the Gather operation along the chosen axis + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 + * @attention Input tensor depth should be given as a preprocessor argument using + * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/U16/S16/U32/S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] input_stride_z Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] input_stride_w Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_w input_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source + * tensor + * @param[in] indices_ptr Pointer to the source tensor. 
Supported data + * types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination + * tensor + */ +__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices), + TENSOR4D_DECLARATION(output)) +{ + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pz = get_global_id(2) % OUTPUT_DIM_Z; + const int pw = get_global_id(2) / OUTPUT_DIM_Z; + + const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z); + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); + +#if AXIS == 0 +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0); + __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); + __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0); +#endif +#elif AXIS == 1 +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0); +#endif +#elif AXIS == 2 +#if INDICES_DIM == 1 + const uint index = 
*(__global const uint *)tensor3D_offset(&indices, pz, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0); +#endif +#elif AXIS == 3 +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index); +#endif +#endif // AXIS + + *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr); +} + +#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl new file mode 100644 index 000000000..80ba73d1d --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "helpers.h"
+
+#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \
+ defined(COLS_A)
+#define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B
+ * (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data type:
+ * QASYMM8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix.
Supported data type: + * same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), + IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z, + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx; + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + int end_row_vec_a = src_addr.s0 + COLS_A; + + VECTOR_INT acc0 = 0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VECTOR_INT acc1 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VECTOR_INT acc2 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VECTOR_INT acc3 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + VECTOR_INT acc4 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + + for (; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y)) + { + // Load values from matrix A + char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + char2 a4 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + // Load values from matrix B + VECTOR_CHAR b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( + 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0; + acc0 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a0.s1; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1.s0; + acc1 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a1.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2.s0; + acc2 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a2.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3.s0; + acc3 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a3.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4.s0; + acc4 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a4.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y)) + { + // Load values from matrix A + char a0 = *(__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + char a1 = *(__global 
char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + char a2 = *(__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + char a3 = *(__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + char a4 = *(__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + // Load values from matrix B + VECTOR_CHAR b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + } + + const int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint8)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst.ptr += z * dst_stride_z * DEPTH_GEMM3D; + + // Store the result + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst.ptr += z * dst_stride_z; + + // Store the result + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 +#endif // defined(REINTERPRET_OUTPUT_AS_3D) +} +#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && + // defined(COLS_A) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl new file mode 100644 index 000000000..a4f7dbd48 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform hashtable_lookup of input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector + */ +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; + + if (lup_id[NUM_DIMS - 1] < 0) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); + return; + } + + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h new file mode 100644 index 000000000..e07a25ec9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_HELPER_H +#define ARM_COMPUTE_HELPER_H + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) +#pragma OPENCL EXTENSION cl_arm_printf : enable +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) + +#define GPU_ARCH_MIDGARD 0x100 +#define GPU_ARCH_BIFROST 0x200 + +/** Concatenate two inputs. + * + * @param[in] a The first input to be concatenated + * @param[in] b The second input to be concatenated + * + * @return The concatenated output + */ +#define CONCAT(a, b) a##b + +/** Expand the given vector + * + * @param[in] x The vector to be expanded + * + * @return The expanded output + */ +#define EXPAND(x) x + +/** Clamp the given value between an upper and lower bound. + * + * @param[in] x The value to be clamped + * @param[in] min_val The lower bound + * @param[in] max_val The upper bound + * + * @return The clamped value. + */ +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + +/** REVn reverses the given vector whose size is n. + * @name REVn + * + * @param[in] x The vector to be reversed + * + * @return The reversed vector + * @{ + */ +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) +#define REV16(x) ((x).sFEDCBA9876543210) +/** @} */ // end of group REVn + +/** Reverse the given vector. 
+ * @name REVERSE + * + * @param[in] x The vector to be reversed + * @param[in] s The size of the vector + * + * @return The reversed vector + * @{ + */ +#define REVERSE_STR(x, s) REV##s((x)) +#define REVERSE(x, s) REVERSE_STR(x, s) +/** @} */ // end of group REVERSE + +/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. + * @name ROTs_n + * + * @param[in] x The vector to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROT1_0(x) ((x)) + +#define ROT2_0(x) ((x)) +#define ROT2_1(x) ((x).s10) + +#define ROT3_0(x) ((x)) +#define ROT3_1(x) ((x).s201) +#define ROT3_2(x) ((x).s120) + +#define ROT4_0(x) ((x)) +#define ROT4_1(x) ((x).s3012) +#define ROT4_2(x) ((x).s2301) +#define ROT4_3(x) ((x).s1230) + +#define ROT8_0(x) ((x)) +#define ROT8_1(x) ((x).s70123456) +#define ROT8_2(x) ((x).s67012345) +#define ROT8_3(x) ((x).s56701234) +#define ROT8_4(x) ((x).s45670123) +#define ROT8_5(x) ((x).s34567012) +#define ROT8_6(x) ((x).s23456701) +#define ROT8_7(x) ((x).s12345670) + +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_10(x) ((x).s6789ABCDEF012345) +#define ROT16_11(x) ((x).s56789ABCDEF01234) +#define ROT16_12(x) ((x).s456789ABCDEF0123) +#define ROT16_13(x) ((x).s3456789ABCDEF012) +#define ROT16_14(x) ((x).s23456789ABCDEF01) +#define ROT16_15(x) ((x).s123456789ABCDEF0) +/** @} */ // end of group ROTs_n + +/** Circular-right-shift (rotate-right) the given vector by the given amount. + * @name ROTATE + * + * @param[in] x The vector to be shifted + * @param[in] s The size of the vector + * @param[in] n The amount to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROTATE_STR(x, s, n) ROT##s##_##n(x) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +/** @} */ // end of group ROTATE + +/** Creates a vector of size n filled with offset values corresponding to the location of each + * element. + * @name V_OFFSn + * + * @param[in] dt The data type of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define V_OFFS1(dt) (dt)(0) +#define V_OFFS2(dt) (dt)(0, 1) +#define V_OFFS3(dt) (dt)(0, 1, 3) +#define V_OFFS4(dt) (dt)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +/** @} */ // end of group V_OFFSn + +/** Create a vector filled with offset values corresponding to the location of each element. 
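REVERSE and ROTATE use the usual two-level macro indirection so that their size and shift arguments are expanded before token pasting selects the REVn / ROTs_n variant. A minimal, self-contained C illustration of why the extra *_STR level is needed (the names and string payload here are made up for the example):

#include <stdio.h>

#define ROT4_1(x) "rotated " #x " right by 1"

/* Without the *_STR indirection, ROTATE(v, 4, SHIFT) would paste the literal
 * token SHIFT instead of its value; the extra level forces the arguments to
 * be macro-expanded first, exactly as REVERSE/ROTATE do above. */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

#define SHIFT 1

int main(void)
{
    puts(ROTATE(v, 4, SHIFT)); /* expands to ROT4_1(v) */
    return 0;
}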
+ * @name VEC_OFFS + * + * @param[in] dt The data type of the output vector + * @param[in] s The size of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +/** @} */ // end of group VEC_OFFS + +#define VLOAD_STR(size) vload##size +#define VLOAD(size) VLOAD_STR(size) + +#define VSTORE_STR(size) vstore##size +#define VSTORE(size) VSTORE_STR(size) + +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short +#define ushort1 ushort +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong +#define double1 double + +#define vload1(OFFSET, PTR) *(OFFSET + PTR) +#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA + +// Convert built-in functions with _sat modifier are not supported in floating point so we create +// defines +// without _sat to overcome this issue +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 +#define convert_float16_sat convert_float16 +#define convert_half_sat convert_float +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short +#define convert_ushort1 convert_ushort +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong +#define convert_double1 convert_double + +#define convert_char1_sat convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_short1_sat convert_short_sat +#define convert_ushort1_sat convert_ushort_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat +#define convert_double1_sat convert_double_sat + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CL_VEC_DATA_TYPE_STR(type, size) type##size +#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) + +#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ + uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint name##_step_z, 
\ + uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_VECTOR_STRUCT(name) \ + update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x) + +#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ + update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, name##_step_x, name##_stride_y, \ + name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, name##_step_x, name##_stride_y, \ + name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0) + +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z, name##_stride_w, name##_step_w, mod_size) + +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \ + mod_size) + +/** Structure to hold Vector information */ +typedef struct Vector +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ +} Vector; + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Structure to hold 3D tensor information */ +typedef struct Tensor3D +{ + __global uchar *ptr; /**< Pointer to the starting postion of 
the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ + int stride_z; /**< Stride of the image in Z dimension (in bytes) */ +} Tensor3D; + +/** Structure to hold 4D tensor information */ +typedef struct Tensor4D +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ + int stride_z; /**< Stride of the image in Z dimension (in bytes) */ + int stride_w; /**< Stride of the image in W dimension (in bytes) */ +} Tensor4D; + +/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's + * data. + * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector + * @param[in] stride_x Stride of the vector in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * + * @return An image object + */ +inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, + uint stride_x, uint step_x) +{ + Vector vector = { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + }; + vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; + return vector; +} + +/** Wrap image information into an Image structure, and make the pointer point at this workitem's + * data. + * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per + * workitem(in bytes) + * + * @return An image object + */ +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, + uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += + img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + return img; +} + +/** Wrap 3D tensor information into an image structure, and make the pointer point at this + * workitem's data. 
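All of the update_*_workitem_ptr helpers follow one pattern: start at the buffer base, add the first-element offset, then add one (global id * step) term per dimension. A plain-C sketch of the 3-D case, with gid0..gid2 standing in for get_global_id(0..2); the helper name is illustrative:

#include <stddef.h>

/* Byte address this "work item" starts at, mirroring
 * update_tensor3D_workitem_ptr: base + offset + gid_i * step_i per axis. */
static unsigned char *workitem_ptr_3d(unsigned char *base,
                                      size_t offset_first_element_in_bytes,
                                      size_t gid0, size_t step_x,
                                      size_t gid1, size_t step_y,
                                      size_t gid2, size_t step_z)
{
    return base + offset_first_element_in_bytes
         + gid0 * step_x + gid1 * step_y + gid2 * step_z;
}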
+ * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per + * workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per + * workitem(in bytes) + * + * @return A 3D tensor object + */ +inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, uint step_x, uint stride_y, + uint step_y, uint stride_z, uint step_z) +{ + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + + get_global_id(1) * step_y + get_global_id(2) * step_z; + return img; +} + +/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this + * workitem's data. + * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per + * workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per + * workitem(in bytes) + * + * @return A 3D tensor object + */ +inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, uint stride_x, + uint step_x, uint stride_y, uint step_y, uint stride_z, + uint step_z) +{ + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + + get_global_id(1) * step_y + get_global_id(2) * step_z; + return tensor; +} + +inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, uint stride_x, + uint step_x, uint stride_y, uint step_y, uint stride_z, + uint step_z, uint stride_w, uint step_w, uint mod_size) +{ + Tensor4D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z, + .stride_w = stride_w}; + + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + + (get_global_id(2) / mod_size) * step_w; + return tensor; +} + +/** Get the pointer position of a Vector + * + * @param[in] vec Pointer to the starting position of the buffer + * @param[in] x Relative X position + */ +inline __global const uchar *vector_offset(const Vector *vec, int x) +{ + return vec->ptr + x * vec->stride_x; +} + +/** Get the pointer position of a Image + * + * @param[in] img Pointer 
to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + */ +inline __global uchar *offset(const Image *img, int x, int y) +{ + return img->ptr + x * img->stride_x + y * img->stride_y; +} + +/** Get the pointer position of a Tensor3D + * + * @param[in] tensor Pointer to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + * @param[in] z Relative Z position + */ +inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) +{ + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; +} + +/** Get the pointer position of a Tensor4D + * + * @param[in] tensor Pointer to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + * @param[in] z Relative Z position + * @param[in] w Relative W position + */ +inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) +{ + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + + w * tensor->stride_w; +} + +#endif // _HELPER_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h new file mode 100644 index 000000000..5f1b3f902 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
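The offset(), tensor3D_offset() and tensor4D_offset() helpers turn relative element coordinates into byte offsets through the per-dimension strides. For a dense W x H x D x N float tensor those strides would typically be derived as in the sketch below; the dense layout is an assumption for illustration, since the kernels only require that the strides passed in match the real layout:

#include <stddef.h>

struct strides4d { size_t x, y, z, w; };

/* Byte strides of a dense float tensor with W the fastest-moving axis. */
static struct strides4d dense_float_strides(size_t W, size_t H, size_t D)
{
    struct strides4d s;
    s.x = sizeof(float);
    s.y = s.x * W;
    s.z = s.y * H;
    s.w = s.z * D;
    return s;
}

/* Equivalent of tensor4D_offset() for a relative (x, y, z, w) move. */
static size_t tensor4d_rel_offset(struct strides4d s,
                                  size_t x, size_t y, size_t z, size_t w)
{
    return x * s.x + y * s.y + z * s.z + w * s.w;
}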
+ */ +#ifndef ARM_COMPUTE_HELPERS_ASYMM_H +#define ARM_COMPUTE_HELPERS_ASYMM_H + +#include "helpers.h" + +/** Convert the given vector with round to nearest even rounding mode + * + * @param[in] x The target to be converted + * @param[in] type The target type + * + * @return The converted vector + */ +#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) + +/** Quantize a floating-point scalar value to 8-bit asymmetric + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline uchar quantize_qasymm8(float input, float offset, float scale) +{ + float out_f32 = input / scale + offset; + uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); + return res_u8; +} + +/** Dequantize a scalar value from 8-bit asymmetric to floating-point + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline float dequantize_qasymm8(uchar input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point + * + * @param[in] input Input value to quantize + * @param[in] offset Quantization offset + * @param[in] scale Quantization scale + * + * @return quantized value + */ +inline float dequantize_qasymm8_signed(char input, float offset, float scale) +{ + return ((float)input - offset) * scale; +} + +/** Quantize a vector of values from floating-point + * + * @param[in] type Output data type. + * @param[in] size Size of vector. + * + * @return quantized values + */ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ + VEC_DATA_TYPE(type, size)); \ + return res; \ + } + +/** Dequantize a vector of values to floating-point + * + * @param[in] type Input data type. + * @param[in] size Size of vector. + * + * @return dequantized values in floating point + */ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ + } + +/** Correctly-rounded-to-nearest division by a power-of-two. + * + * @param[in] size Size of vector. + * + * @return Correctly-rounded-to-nearest division by a power-of-two. 
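quantize_qasymm8 and dequantize_qasymm8 implement the usual asymmetric 8-bit scheme: q = saturate(round(x / scale + offset)) and x is recovered as (q - offset) * scale. A plain-C equivalent, with rintf() standing in for the kernel's round-to-nearest-even conversion and saturation bounds for unsigned 8-bit:

#include <math.h>
#include <stdint.h>

static uint8_t quantize_qasymm8_ref(float x, float offset, float scale)
{
    float q = rintf(x / scale + offset);  /* round to nearest even */
    if (q < 0.0f)   q = 0.0f;             /* saturate to uchar range */
    if (q > 255.0f) q = 255.0f;
    return (uint8_t)q;
}

static float dequantize_qasymm8_ref(uint8_t q, float offset, float scale)
{
    return ((float)q - offset) * scale;
}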
+ */ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + { \ + const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ + const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ + VEC_DATA_TYPE(int, size) \ + mask = (one << exponent) - one; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, x < 0); \ + return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ + } + +/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), + * rounding to the nearest value, and saturating -1 * -1 to the maximum value. + * + * @param[in] size Size of vector. + * + * @return Product of two fixed-point numbers. + */ +#define ASYMM_MULT_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(int, size) \ + overflow = a == b && a == INT_MIN; \ + VEC_DATA_TYPE(long, size) \ + a_64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ + VEC_DATA_TYPE(int, size) \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + return select(ab_x2_high32, INT_MAX, overflow); \ + } + +/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). + * + * @param[in] size Size of vector. + * + * @return Result in fixed-point format Q0. + */ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ + a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = \ + ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + \ + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ + } + +/** Each bit of the result is set to the corresponding bit of either then_val or + * else_val depending on whether the corresponding bit of if_mask is set. + * Equivalent to the VBSL instruction in ARM NEON. + * + * @param[in] size Size of vector. + * + * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding + * bit in @p if_mask is set or not. 
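asymm_rounding_divide_by_POW2 and asymm_mult mirror the well-known gemmlowp fixed-point primitives (RoundingDivideByPOT and SaturatingRoundingDoublingHighMul). Scalar C sketches of both, which the vector versions above only extend with lane-wise selects:

#include <stdint.h>

/* (x >> exponent) with round-to-nearest, ties away from zero. */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    int32_t mask = ((int32_t)1 << exponent) - 1;
    int32_t remainder = x & mask;
    int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* High 32 bits of 2*a*b with rounding; the single overflow case
 * INT32_MIN * INT32_MIN saturates to INT32_MAX, as in asymm_mult. */
static int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX;
    int64_t ab = (int64_t)a * (int64_t)b;
    int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return (int32_t)((ab + nudge) / (1ll << 31));
}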
+ */ +#define ASYMM_SELECT_USING_MASK_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \ + VEC_DATA_TYPE(int, size) then_val, \ + VEC_DATA_TYPE(int, size) else_val) \ + { \ + return (if_mask & then_val) ^ (~if_mask & else_val); \ + } + +/** For each element of input vector, the corresponding bits of the result item are set + * if the input item is zero. + * + * @param[in] size Size of vector. + * + * @returns Output vector with bits set when corresponding bit in @p a is zero. + */ +#define ASYMM_MASK_IF_ZERO_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) all_zeros = 0; \ + const VEC_DATA_TYPE(int, size) all_ones = ~0; \ + return select(all_zeros, all_ones, a == 0); \ + } + +/** For each element of input vector, the corresponding bits of the result item are set + * if the input item is non-zero. + * + * @param[in] size Size of vector. + * + * @returns Output vector with bits set when corresponding bit in @p a is non zero. + */ +#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) all_zeros = 0; \ + const VEC_DATA_TYPE(int, size) all_ones = ~0; \ + return select(all_zeros, all_ones, a != 0); \ + } + +#define EXP_BARREL_SHIFTER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ + VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + { \ + if (k_integer_bits > exponent) \ + { \ + const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ + return ASYMM_SELECT_USING_MASK( \ + ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ + } \ + \ + return result; \ + } + +/** Calculates \f$ exp(x) \f$ for x < 0. + * + * @param[in] size Size of vector. + * + * @return Result in fixed-point format Q0. 
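asymm_select_using_mask together with asymm_mask_if_zero / asymm_mask_if_non_zero form a branch-free per-bit select (the NEON VBSL idiom): the mask is either all ones or all zeros, so the select reduces to AND/XOR. A scalar C equivalent:

#include <stdint.h>

/* Take bits from then_val where if_mask is 1, otherwise from else_val. */
static int32_t select_using_mask(int32_t if_mask, int32_t then_val, int32_t else_val)
{
    return (if_mask & then_val) ^ (~if_mask & else_val);
}

/* All-ones when the predicate holds, all-zeros otherwise, so the result can
 * be fed straight into select_using_mask(). */
static int32_t mask_if_non_zero(int32_t a) { return a != 0 ? ~0 : 0; }
static int32_t mask_if_zero(int32_t a)     { return a == 0 ? ~0 : 0; }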
+ */ +#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + { \ + const int k_fractional_bits = 31 - k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + k_one_quarter = 1 << (k_fractional_bits - 2); \ + VEC_DATA_TYPE(int, size) \ + mask = k_one_quarter - 1; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ + a_mod_quarter_minus_one_quarter_scaled, size); \ + VEC_DATA_TYPE(int, size) \ + remainder = a_mod_quarter_minus_one_quarter - a; \ + \ + result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ + size); \ + result = \ + EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + \ + if (k_integer_bits > 5) \ + { \ + const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ + result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ + } + +/** Calculates the product of a integer value by a power of two, with either a positive exponent + * (equivalent to an arithmetic left shift, saturating) or a negative exponent + * (equivalent to an arithmetic right shift, rounding to nearest). + * + * @param[in] size Size of vector. + * + * @return Arithmetic left or right shift. + */ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ + } + +/** Calculates (a+b)/2, rounded to the nearest integer. + * Equivalent to VRHADD in the ARM NEON instruction set. + * + * @param[in] size Size of vector. + * + * @return (a+b)/2, rounded to the nearest integer. 
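The fixed-point exp in asymm_exp_on_negative_values reduces its argument to [-1/4, 0), evaluates a degree-4 Taylor polynomial there (1895147668 and 715827883 are exp(-1/8) and 1/3 in Q0.31), and then multiplies in per-quarter factors through the barrel shifter, whose constants 1672461947, 1302514674, 790015084, 290630308, 39332535, 720401 and 242 encode exp(-1/4), exp(-1/2), exp(-1), exp(-2), exp(-4), exp(-8) and exp(-16) in the same format. A float cross-check of the interval polynomial, useful when validating the fixed-point path:

#include <math.h>

/* exp(a) for a in [-1/4, 0), computed as exp(-1/8) * P(a + 1/8) with P the
 * degree-4 Taylor polynomial of exp around 0, matching the kernel's algebra. */
static float exp_on_interval_ref(float a)
{
    const float c = expf(-0.125f);  /* constant_term in the kernel */
    float x  = a + 0.125f;
    float x2 = x * x;
    float p  = 1.0f + x + x2 / 2.0f + x2 * x / 6.0f + x2 * x2 / 24.0f;
    return c * p;
}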
+ */ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, sum >= 0); \ + return convert_int##size((sum + sign) / 2); \ + } + +/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). + * + * @param[in] size Size of vector. + * + * @return Result in fixed-point format Q0. + */ +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ + const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ + VEC_DATA_TYPE(int, size) \ + half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ + const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ + const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ + VEC_DATA_TYPE(int, size) \ + x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ + for (int i = 0; i < 3; i++) \ + { \ + VEC_DATA_TYPE(int, size) \ + half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ + VEC_DATA_TYPE(int, size) \ + one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ + VEC_DATA_TYPE(int, size) \ + tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ + x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ + } \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ + } + +/** Considering the integer value as fixed-point, change the number of integer bits and update value + * accordingly. + * + * @param[in] size Size of vector. + * + * @return Rescaled value. 
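asymm_one_over_one_plus_x_for_x_in_0_1 computes 1 / (1 + a) by halving the denominator into [0.5, 1), seeding with the classic 48/17 - 32/17 * d estimate (1515870810 and -1010580540 are those constants in the kernel's Q2 format), and running three Newton-Raphson steps. The same algebra in floats, as a sketch for cross-checking:

/* For a in (0, 1): d = (1 + a) / 2 lies in [0.5, 1); three Newton steps
 * x <- x * (2 - d * x) converge to 1/d, and 1/(1 + a) is then (1/d) / 2. */
static float one_over_one_plus_x_ref(float a)
{
    float d = 0.5f * (1.0f + a);                   /* half_denominator */
    float x = 48.0f / 17.0f - 32.0f / 17.0f * d;   /* initial estimate */
    for (int i = 0; i < 3; ++i)
        x = x * (2.0f - d * x);                    /* Newton step for 1/d */
    return 0.5f * x;
}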
+ */ +#define ASYMM_RESCALE_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \ + int src_integer_bits, int dst_integer_bits) \ + { \ + int exponent = src_integer_bits - dst_integer_bits; \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ + } + +#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) +#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) +#define DEQUANTIZE_STR(input, offset, scale, type, size) \ + dequantize_##type##size(input, offset, scale) +#define DEQUANTIZE(input, offset, scale, type, size) \ + DEQUANTIZE_STR(input, offset, scale, type, size) + +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ + asymm_rounding_divide_by_POW2_##size(x, exponent) +#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ + ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) +#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ + asymm_select_using_mask##size(if_mask, then_val, else_val) +#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) +#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) +#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ + remainder, size) \ + exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ + remainder) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \ + asymm_exp_on_negative_values##size(a, k_integer_bits) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ + asymm_saturating_rounding_mult_by_pow2##size(x, exponent) +#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) + +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 
0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ + } +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ + multiply_by_quantized_multiplier##size(input, qmul, shift) + +QUANTIZE_IMPL(uchar, 1) +QUANTIZE_IMPL(char, 1) +QUANTIZE_IMPL(uint, 1) +QUANTIZE_IMPL(int, 1) +QUANTIZE_IMPL(uchar, 4) +QUANTIZE_IMPL(ushort, 4) +QUANTIZE_IMPL(short, 4) +QUANTIZE_IMPL(uchar, 16) +QUANTIZE_IMPL(char, 16) +QUANTIZE_IMPL(ushort, 16) +QUANTIZE_IMPL(short, 16) +QUANTIZE_IMPL(uint, 16) +QUANTIZE_IMPL(int, 16) + +DEQUANTIZE_IMPL(uchar, 1) +DEQUANTIZE_IMPL(char, 1) +DEQUANTIZE_IMPL(uint, 1) +DEQUANTIZE_IMPL(int, 1) +DEQUANTIZE_IMPL(uchar, 4) +DEQUANTIZE_IMPL(ushort, 4) +DEQUANTIZE_IMPL(short, 4) +DEQUANTIZE_IMPL(uchar, 16) +DEQUANTIZE_IMPL(char, 16) +DEQUANTIZE_IMPL(ushort, 16) +DEQUANTIZE_IMPL(short, 16) +DEQUANTIZE_IMPL(uint, 16) +DEQUANTIZE_IMPL(int, 16) + +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) + +ASYMM_MULT_IMPL(1) +ASYMM_MULT_IMPL(2) +ASYMM_MULT_IMPL(4) +ASYMM_MULT_IMPL(8) +ASYMM_MULT_IMPL(16) + +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) + +ASYMM_SELECT_USING_MASK_IMPL(1) +ASYMM_SELECT_USING_MASK_IMPL(2) +ASYMM_SELECT_USING_MASK_IMPL(4) +ASYMM_SELECT_USING_MASK_IMPL(8) +ASYMM_SELECT_USING_MASK_IMPL(16) + +ASYMM_MASK_IF_ZERO_IMPL(1) +ASYMM_MASK_IF_ZERO_IMPL(2) +ASYMM_MASK_IF_ZERO_IMPL(4) +ASYMM_MASK_IF_ZERO_IMPL(8) +ASYMM_MASK_IF_ZERO_IMPL(16) + +ASYMM_MASK_IF_NON_ZERO_IMPL(1) +ASYMM_MASK_IF_NON_ZERO_IMPL(2) +ASYMM_MASK_IF_NON_ZERO_IMPL(4) +ASYMM_MASK_IF_NON_ZERO_IMPL(8) +ASYMM_MASK_IF_NON_ZERO_IMPL(16) + +EXP_BARREL_SHIFTER_IMPL(2) +EXP_BARREL_SHIFTER_IMPL(4) +EXP_BARREL_SHIFTER_IMPL(8) +EXP_BARREL_SHIFTER_IMPL(16) + +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) + +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) + +ASYMM_ROUNDING_HALF_SUM_IMPL(2) +ASYMM_ROUNDING_HALF_SUM_IMPL(4) +ASYMM_ROUNDING_HALF_SUM_IMPL(8) +ASYMM_ROUNDING_HALF_SUM_IMPL(16) + +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) + +ASYMM_RESCALE_IMPL(1) +ASYMM_RESCALE_IMPL(2) +ASYMM_RESCALE_IMPL(4) +ASYMM_RESCALE_IMPL(8) +ASYMM_RESCALE_IMPL(16) + +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) +MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) + +#endif // ARM_COMPUTE_HELPERS_ASYMM_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl new file mode 100644 index 000000000..014842680 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -0,0 +1,267 @@ +/* + * 
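multiply_by_quantized_multiplier above is the standard integer requantization step: scale by a fixed-point multiplier, then shift left or right depending on the sign of the shift. A self-contained scalar sketch of the same arithmetic; the INT32_MIN * INT32_MIN saturation corner handled by asymm_mult is omitted here for brevity:

#include <stdint.h>

static int32_t multiply_by_quantized_multiplier_ref(int32_t input, int32_t qmul, int shift)
{
    int left_shift  = shift > 0 ? shift : 0;
    int right_shift = shift > 0 ? 0 : -shift;

    /* Doubling high multiply with rounding (high 32 bits of 2*in*qmul). */
    int64_t ab = (int64_t)(input * (1 << left_shift)) * (int64_t)qmul;
    int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    int32_t high = (int32_t)((ab + nudge) / (1ll << 31));

    /* Rounding divide by 2^right_shift. */
    int32_t mask = ((int32_t)1 << right_shift) - 1;
    int32_t threshold = (mask >> 1) + (high < 0 ? 1 : 0);
    return (high >> right_shift) + ((high & mask) > threshold ? 1 : 0);
}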
Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) +/** This function normalizes the input 2D tensor across the first dimension with respect to mean and + * standard deviation of the same dimension. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. + * -DDATA_TYPE=float + * @attention Normalization epsilon parameter should be given as a preprocessor argument with + * -DEPSILON=value. e.g. -DEPSILON=0.001f + * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, + * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported + * data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr (Optional) Pointer to the destination tensor. 
+ * Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in + * the destination tensor + * @param[in] gamma_ptr (Optional) Pointer to the gamma tensor. + * Supported data types: same as @p input_ptr + * @param[in] gamma_stride_x (Optional) Stride of the gamma tensor in X + * dimension (in bytes) + * @param[in] gamma_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in + * the gamma tensor + * @param[in] beta_ptr (Optional) Pointer to the beta tensor. Supported + * data types: same as @p input_ptr + * @param[in] beta_stride_x (Optional) Stride of the beta tensor in X + * dimension (in bytes) + * @param[in] beta_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in + * the beta tensor + */ +__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), +#ifndef IN_PLACE + TENSOR4D_DECLARATION(output) +#endif /* IN_PLACE */ +#ifdef GAMMA + , + VECTOR_DECLARATION(gamma) +#endif // GAMMA +#ifdef BETA + , + VECTOR_DECLARATION(beta) +#endif // BETA + ) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +#ifndef IN_PLACE + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +#endif /* IN_PLACE */ + + float sum = 0.f; + float sum_sq = 0.f; + +#if defined(NHWC) + + const int ch = get_global_id(0); // Current channel + const int batch = get_global_id(2); // Current batch + const int elements_plane = DIM_Y * DIM_Z; + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch)); + sum += data; + sum_sq += data * data; + } + } + +#else // !defined(NHWC) + const int ch = get_global_id(2) % DIM_Z; // Current channel + const int batch = get_global_id(2) / DIM_Z; // Current batch + const int elements_plane = DIM_X * DIM_Y; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + part_sum = 0.f; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + part_sum_sq = 0.f; + // Calculate partial sum + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)); + part_sum += data; + part_sum_sq += data * data; + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)); + part_sum.s0 += data; + part_sum_sq.s0 += data * data; + } + } +// Perform reduction +#if VEC_SIZE > 8 + part_sum.s01234567 += part_sum.s89abcdef; 
+ part_sum_sq.s01234567 += part_sum_sq.s89abcdef; +#endif // VEC_SIZE > 8 +#if VEC_SIZE > 4 + part_sum.s0123 += part_sum.s4567; + part_sum_sq.s0123 += part_sum_sq.s4567; +#endif // VEC_SIZE > 4 +#if VEC_SIZE > 2 + part_sum.s01 += part_sum.s23; + part_sum_sq.s01 += part_sum_sq.s23; +#endif // VEC_SIZE > 2 + part_sum.s0 += part_sum.s1; + part_sum_sq.s0 += part_sum_sq.s1; + + sum = (float)part_sum.s0; + sum_sq = (float)part_sum_sq.s0; + +#endif // defined(NHWC) + + const float mean_float = (sum / elements_plane); + const DATA_TYPE mean = (DATA_TYPE)mean_float; + const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float); +#if defined(GAMMA) + const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON); + const DATA_TYPE multip = (DATA_TYPE)multip_float; +#else // !defined(GAMMA) + const DATA_TYPE multip = (DATA_TYPE)0; +#endif // defined(GAMMA) +#if defined(BETA) + const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch); +#else // !defined(BETA) + const DATA_TYPE beta = 0; +#endif // defined(BETA) + +#if defined(NHWC) + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } + +#else // !defined(NHWC) + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, input_address); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = (data - mean) * multip + beta; + VSTORE(VEC_SIZE) + (res, 0, output_address); + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } +#endif // defined(NHWC) +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) */ diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl new file mode 100644 index 000000000..3943fc4c2 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
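Stripped of layout handling and vectorization, the instance_normalization_ex kernel computes one mean and variance per (channel, batch) plane and then applies y = (x - mean) * gamma / sqrt(var + epsilon) + beta; when the GAMMA or BETA define is absent the kernel simply uses 0 for that term. A plain-C reference for a single H*W plane, with illustrative naming:

#include <math.h>
#include <stddef.h>

/* Normalize one plane in place, mirroring the per-(channel, batch) math above. */
static void instance_norm_plane_ref(float *plane, size_t count,
                                    float gamma, float beta, float epsilon)
{
    float sum = 0.0f, sum_sq = 0.0f;
    for (size_t i = 0; i < count; ++i)
    {
        sum    += plane[i];
        sum_sq += plane[i] * plane[i];
    }
    float mean   = sum / (float)count;
    float var    = sum_sq / (float)count - mean * mean;   /* E[x^2] - E[x]^2 */
    float multip = gamma / sqrtf(var + epsilon);
    for (size_t i = 0; i < count; ++i)
        plane[i] = (plane[i] - mean) * multip + beta;
}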
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) + +/** This performs to multiply input by scale_factor. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note Quantization scale of input tensor is passed in with -DSCALE=scale. + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: S8 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] scale_ptr Pointer to the source tensor. Supported data + * types: S32 + * @param[in] scale_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] scale_step_x scale_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] scale_offset_first_element_in_bytes The offset of the first element in the scale + * tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale), + IMAGE_DECLARATION(output), float multiplier) +{ + // Get pixels pointer + Image input = CONVERT_TO_IMAGE_STRUCT(input); + Image output = CONVERT_TO_IMAGE_STRUCT(output); + +#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global int *)input.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + + // Create scale vector + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + vscale = *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)); + + // Dequantize + vscale *= (DATA_TYPE)(multiplier); + val *= vscale; + + // Store result + VSTORE(VEC_SIZE) + (val, 0, (__global DATA_TYPE *)output.ptr); +#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE *)(output.ptr)) = + ((DATA_TYPE)(*((__global int *)(input.ptr)))) * + *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); +#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) +} + +#endif // defined(VEC_SIZE) && defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..15c16f80c --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
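Aside from its vectorized path, which shifts the last access back inside the row so VLOAD never reads past the bounds, the multiply_scale_factor kernel reduces to a per-row scale: each integer element of row y becomes input * scale[y] * multiplier in the output type. A plain-C reference of that reduction, assuming contiguous rows:

#include <stddef.h>

static void multiply_scale_factor_ref(const int *input, const float *scale,
                                      float *output, size_t width, size_t height,
                                      float multiplier)
{
    for (size_t y = 0; y < height; ++y)
        for (size_t x = 0; x < width; ++x)
            output[y * width + x] =
                (float)input[y * width + x] * scale[y] * multiplier;
}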
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Performs a negation of input tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * + * @param[in] in_ptr Pointer to the source image. Supported data types: + * S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination + * image + * + */ +__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl new file mode 100644 index 000000000..c274aba62 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z) + +/** Performs the OneHot operation along the chosen axis + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 + * @attention Input tensor depth should be given as a preprocessor argument using + * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 + * + * + * @param[in] indices_ptr Pointer to the source tensor. Supported data + * types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along + * Y processed per work item (in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along + * Z processed per work item (in bytes) + * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source + * tensor + * @param[in] on_value_ptr Pointer to the on_value vector. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32. + * @param[in] on_value_stride_x Stride of the on_value vector in X dimension + * (in bytes) + * @param[in] on_value_step_x on_value_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value + * vector + * @param[in] off_value_ptr Pointer to the off_value vector. Supported + * data types: Same as @p on_value. 
+ * @param[in] off_value_stride_x Stride of the off_value vector in X + * dimension (in bytes) + * @param[in] off_value_step_x off_value_stride_x * number of elements + * along X processed per work item (in bytes) + * @param[in] off_value_offset_first_element_in_bytes Offset of the first element in the off_value + * vector + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p on_value + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] output_offset_first_element_in_bytes Offset of the first element in the + * destination tensor + */ +__kernel void one_hot(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value), + VECTOR_DECLARATION(off_value), TENSOR4D_DECLARATION(output)) +{ + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pz = get_global_id(2) % OUTPUT_DIM_Z; + const int pw = get_global_id(2) / OUTPUT_DIM_Z; + + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); + +#if AXIS == 0 + const int index = *(__global const int *)tensor3D_offset(&indices, py, pz, pw); + *(__global DATA_TYPE *)output.ptr = index == px ? *((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#elif AXIS == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, pz, pw); + *(__global DATA_TYPE *)output.ptr = index == py ? *((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#elif AXIS == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pw); + *(__global DATA_TYPE *)output.ptr = index == pz ? *((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#elif AXIS == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); + *(__global DATA_TYPE *)output.ptr = index == pw ? *((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#endif // AXIS +} + +/** Performs the OneHot operation along the chosen axis as off_value being zero + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 + * @attention Input tensor depth should be given as a preprocessor argument using + * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 + * + * + * @param[in] indices_ptr Pointer to the source tensor. 
Supported data + * types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along + * Y processed per work item (in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along + * Z processed per work item (in bytes) + * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source + * tensor + * @param[in] on_value_ptr Pointer to the on_value vector. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32. + * @param[in] on_value_stride_x Stride of the on_value vector in X dimension + * (in bytes) + * @param[in] on_value_step_x on_value_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value + * vector + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p on_value + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] output_offset_first_element_in_bytes Offset of the first element in the + * destination tensor + */ +__kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value), + TENSOR4D_DECLARATION(output)) +{ + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pz = get_global_id(2); + + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); + const Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, OUTPUT_DIM_Z); + + const int index = *(__global const int *)tensor3D_offset(&indices, px, py, pz); + + if (index < 0 || index >= DEPTH) + return; + +#if AXIS == 0 + *(__global DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) = + *((__global const DATA_TYPE *)on_value_ptr); +#elif AXIS == 1 + *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) = + *((__global const DATA_TYPE *)on_value_ptr); +#elif AXIS == 2 + *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) = + *((__global const DATA_TYPE *)on_value_ptr); +#elif AXIS == 3 + *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) = + *((__global const DATA_TYPE *)on_value_ptr); +#endif // AXIS +} + +#endif // defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl 
b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl new file mode 100644 index 000000000..76fda9041 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers_asymm.h" + +#ifdef SATURATE +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) +#else /* SATURATE */ +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) +#endif /* SATURATE */ +#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of + * GEMMLowp to QASYMM8 + * + * The following computations will be performed by the kernel: + * + * -# Add offset terms to inputs + * -# Multiply inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and + * -DIN2_OFFSET + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + * -DRESULT_SHIFT + * + * @param[in] in1_ptr Pointer to the source image. 
Supported data types: + * U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data + * types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * image + * @param[in] scale Float scaling factor. Supported data types: F32 + */ +__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), const float scale) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Perform multiplication of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; + + // Multiply with a multiplier smaller than 1 + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + + // TODO: Apply min-max BOUND to support fuse with relu. 
+ /*
+ #if defined(MIN_BOUND)
+ res = max(res, (uchar16)MIN_BOUND);
+ #endif // defined(MIN_BOUND)
+ #if defined(MAX_BOUND)
+ res = min(res, (uchar16)MAX_BOUND);
+ #endif // defined(MAX_BOUND)
+ */
+
+ // Store result
+ VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
new file mode 100644
index 000000000..4ae9adb0b
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+#define MIN_QUANT_VAL -127
+#define MAX_QUANT_VAL 127
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
+
+/** This performs the quantization of floating point inputs to 8-bit signed integers.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g.
+ * -DDATA_TYPE_IN=float
+ * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type.
+ * e.g. -DDATA_TYPE_OUT=char
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ * -DVEC_SIZE=16
+ * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g.
+ * -DSCALE=0.125 + * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g. + * -DOFFSET=125 + * @note Minimum value for quantized type should be given as a preprocessor argument using + * -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0 + * @note Maximum value for quantized type should be given as a preprocessor argument using + * -DMAX_QUANT_VAL=value. e.g. -DMAXIN_QUANT_VAL=255 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: S8 + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[out] scale_ptr Pointer to the scale tensor. Supported data + * types: F32 + * @param[in] scale_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] scale_step_x scale_stride_x * number of elements along X + * processed per workitem(in bytes) + */ +__kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale), + IMAGE_DECLARATION(output)) +{ + // Get pixels pointer + Image input = CONVERT_TO_IMAGE_STRUCT(input); + Image output = CONVERT_TO_IMAGE_STRUCT(output); + +#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + + // Create scale vector + const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = + *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); + + // Quantize + VEC_DATA_TYPE(int, VEC_SIZE) + res = CLAMP(CONVERT_RTE_VEC(val / vscale, int, VEC_SIZE), MIN_QUANT_VAL, MAX_QUANT_VAL); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); +#else //! 
defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP( + CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / + (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), + int), + MIN_QUANT_VAL, MAX_QUANT_VAL); +#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl new file mode 100644 index 000000000..832ac1270 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) +/** Perform reduce max/min + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. + */ +__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE value = + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + for (int i = 1; i < dim; ++i) + { + indices[axis] = i; + +#if OP_CODE == 1 // REDUCE_MAX + value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); + +#elif OP_CODE == 2 // REDUCE_MIN + value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], + indices[2], indices[3]))); + +#else // OP NOT SUPPORTED + return; + +#endif + } + + *((__global DATA_TYPE *)out.ptr) = value; +} + +/** Perform reduce sum/mean + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. + * e.g. -DDEPTH_OUT=16 + * @attention Operation type(code) specifying which operation to perform should be passed as + * preprocessor argument using -DOP_CODE = number. e.g. 
-DOP_CODE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data + * types: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination image. Supported data + * types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination image + * @param[in] axis Axis through which reduction occurs + * @param[in] dim Dimension across the axis to be reduced. + */ +__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + const int axis, const int dim) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + + int indices[4] = { + get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, + }; + + DATA_TYPE sum_value = (DATA_TYPE)0; + for (int i = 0; i < dim; ++i) + { + indices[axis] = i; + sum_value += *( + (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + } + +#if OP_CODE == 3 // REDUCE_SUM + *((__global DATA_TYPE *)out.ptr) = sum_value; + +#elif OP_CODE == 4 // REDUCE_MEAN + *((__global DATA_TYPE *)out.ptr) = sum_value / CONVERT(dim, DATA_TYPE); + +#else // OP NOT SUPPORTED + return; + +#endif +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl new file mode 100644 index 000000000..3d5e90356 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if defined(WIDTH) +/** This function identifies the min and maximum value of an input 3D tensor. + * + * @note The width, height and depth of the input tensor must be provided at compile time using + * -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: + * F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] dst_ptr Pointer to the min/max vector. Minimum value in + * position 0, maximum value in position 1. Supported data types: F32. 
+ * @param[in] dst_stride_x Stride of the min/max vector in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max + * vector + */ +__kernel void scale_factor_symm8(IMAGE_DECLARATION(src), VECTOR_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + + float4 min_value = (float4)FLT_MAX; + float4 max_value = (float4)-FLT_MAX; + + int x = 0; + __global float *src_addr = (__global float *)(src.ptr); + + for (; x <= (int)(WIDTH - 8); x += 8) + { + float8 value = vload8(0, (__global float *)(src_addr + x)); + + min_value = select(value.s0123, min_value, min_value < value.s0123); + min_value = select(value.s4567, min_value, min_value < value.s4567); + + max_value = select(value.s0123, max_value, max_value > value.s0123); + max_value = select(value.s4567, max_value, max_value > value.s4567); + } + + for (; x < WIDTH; ++x) + { + float value = *(src_addr + x); + + min_value.s0 = min(min_value.s0, value); + max_value.s0 = max(max_value.s0, value); + } + + // Perform min/max reduction + min_value.s01 = min(min_value.s01, min_value.s23); + min_value.s0 = min(min_value.s0, min_value.s1); + max_value.s01 = max(max_value.s01, max_value.s23); + max_value.s0 = max(max_value.s0, max_value.s1); + + // Extract scale + max_value.s0 = max(fabs(min_value.s0), fabs(max_value.s0)) / 127.0f; + + // Store min and max + *((__global float *)(dst_ptr) + get_global_id(1)) = max_value.s0; +} +#endif // defined(WIDTH) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl new file mode 100644 index 000000000..3eb1a4ce7 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf, + __global int *in_ind_buf, const int n) +{ + int gid = get_global_id(0); + int lws = get_local_size(0); + int groups = get_num_groups(0); + int gws = lws * groups; + int iter = n / gws; + + Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); + + for (int i = 0; i < iter; ++i) + { + int idx = i * gws + gid; + in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x); + in_ind_buf[idx] = idx; + } +} + +__kernel void topkv2_find_first_negative(__global float *out_key_buf, + __global int *first_negative_idx, int n) +{ + int gid = get_global_id(0); + + if (gid == n - 1) + { + // if the last item is positive, the first negative index is n. + if (out_key_buf[gid] > 0.f) + *first_negative_idx = n; + } + else if (gid == 0) + { + // if the first item is negative, set it 0. + if (out_key_buf[gid] < 0.f) + *first_negative_idx = 0; + } + else + { + // if its left is positive and it is negative, then it is the first negative item. + if (out_key_buf[gid - 1] > 0.f && out_key_buf[gid] < 0.f) + *first_negative_idx = gid; + } +} + +__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf, + __global float *in_ind_buf, __global float *out_ind_buf, + __global int *first_negative_idx, int n) +{ + int gid = get_global_id(0); + + int num_negs = n - *first_negative_idx; + int in_idx; + + if (gid < num_negs) + { + in_idx = n - 1 - gid; + } + else + { + in_idx = gid - num_negs; + } + + out_key_buf[gid] = in_key_buf[in_idx]; + out_ind_buf[gid] = in_ind_buf[in_idx]; +} + +__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices), + __global float *out_key_buf, __global int *out_ind_buf, int n) +{ + int gid = get_global_id(0); + + Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values); + Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices); + + int idx = n - 1 - gid; + + *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx]; + *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx]; +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl new file mode 100644 index 000000000..460de790b --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +__global inline float *get_vec_elem(Vector *vec, int idx) +{ + return (__global float *)(vec->ptr + idx * vec->stride_x); +} + +__global inline int *get_vec_elem_int(Vector *vec, int idx) +{ + return (__global int *)(vec->ptr + idx * vec->stride_x); +} + +// A utility function to swap two elements +void swap(__global float *a, __global float *b) +{ + float t = *a; + *a = *b; + *b = t; +} + +void swap_idx(__global int *a, __global int *b) +{ + int t = *a; + *a = *b; + *b = t; +} + +/* This function is same in both iterative and recursive*/ +int partition(Vector *arr, __global int *indices, int l, int h) +{ + float x = *get_vec_elem(arr, h); + int i = (l - 1); + + for (int j = l; j <= h - 1; j++) + { + if (*get_vec_elem(arr, j) >= x) + { + i++; + swap(get_vec_elem(arr, i), get_vec_elem(arr, j)); + swap_idx(&indices[i], &indices[j]); + } + } + swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h)); + swap_idx(&indices[i + 1], &indices[h]); + return (i + 1); +} + +/* A[] --> Array to be sorted, + l --> Starting index, + h --> Ending index */ +void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h) +{ + // Create an auxiliary stack + + // initialize top of stack + int top = -1; + + // push initial values of l and h to stack + stack[++top] = l; + stack[++top] = h; + + // Keep popping from stack while is not empty + while (top >= 0) + { + // Pop h and l + h = stack[top--]; + l = stack[top--]; + + // Set pivot element at its correct position + // in sorted array + int p = partition(arr, indices, l, h); + + // If there are elements on left side of pivot, + // then push left side to stack + if (p - 1 > l) + { + stack[++top] = l; + stack[++top] = p - 1; + } + + // If there are elements on right side of pivot, + // then push right side to stack + if (p + 1 < h) + { + stack[++top] = p + 1; + stack[++top] = h; + } + } +} + +__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values), + VECTOR_DECLARATION(topk_indices), __global int *indices, + __global int *temp_stack, int k, int n) +{ + Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); + Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values); + Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices); + + for (int i = 0; i < n; ++i) + { + indices[i] = i; + } + + quickSortIterative(&input, indices, temp_stack, 0, n - 
1); + + // extract k items. + for (int i = 0; i < k; ++i) + { + *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i); + *get_vec_elem_int(&topk_indices, i) = indices[i]; + } +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl new file mode 100644 index 000000000..e9d4696b4 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// reference: +// https://code.google.com/archive/p/ocl-radix-sort/source/default/source +// OpenCL kernel sources for the CLRadixSort class +// the #include does not exist in OpenCL +// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr +// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html +// if you find this software usefull you can cite the following work in your reports or articles: +// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. 
+// http://hal.archives-ouvertes.fr/hal-00596730 + +// Reference for floating point radix sort: +// http://www.codercorner.com/RadixSortRevisited.htm + +// compute the histogram for each radix and each virtual processor for the pass +__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms, + const int pass, __local int *loc_histo, const int n) +{ + int it = get_local_id(0); // i local number of the processor + int ig = get_global_id(0); // global number = i + g I + + int gr = get_group_id(0); // g group number + + int groups = get_num_groups(0); + int items = get_local_size(0); + + // set the local histograms to zero + for (int ir = 0; ir < _RADIX; ir++) + { + loc_histo[ir * items + it] = 0; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // range of keys that are analyzed by the work item + int size = n / groups / items; // size of the sub-list + int start = ig * size; // beginning of the sub-list + + unsigned int key; + int shortkey, k; + + // compute the index + // the computation depends on the transposition + for (int j = 0; j < size; j++) + { +#ifdef TRANSPOSE + k = groups * items * j + ig; +#else + k = j + start; +#endif + + key = *((__global unsigned int *)(in_key_buf + k)); + + // extract the group of _BITS bits of the pass + // the result is in the range 0.._RADIX-1 + shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); + + // increment the local histogram + loc_histo[shortkey * items + it]++; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // copy the local histogram to the global one + for (int ir = 0; ir < _RADIX; ir++) + { + d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; + } + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +// initial transpose of the list for improving +// coalescent memory access +__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol, + const int nbrow, const __global int *inperm, __global int *outperm, + __local int *blockmat, __local int *blockperm, const int tilesize) +{ + + int i0 = get_global_id(0) * tilesize; // first row index + int j = get_global_id(1); // column index + + int jloc = get_local_id(1); // local column index + + // fill the cache + for (int iloc = 0; iloc < tilesize; iloc++) + { + int k = (i0 + iloc) * nbcol + j; // position in the matrix + blockmat[iloc * tilesize + jloc] = invect[k]; +#ifdef PERMUT + blockperm[iloc * tilesize + jloc] = inperm[k]; +#endif + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // first row index in the transpose + int j0 = get_group_id(1) * tilesize; + + // put the cache at the good place + for (int iloc = 0; iloc < tilesize; iloc++) + { + int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose + outvect[kt] = blockmat[jloc * tilesize + iloc]; +#ifdef PERMUT + outperm[kt] = blockperm[jloc * tilesize + iloc]; +#endif + } +} + +// each virtual processor reorders its data using the scanned histogram +__kernel void radixsort_reorder(__global float *in_key, __global float *out_key, + __global int *d_Histograms, const int pass, + __global int *indices_in, __global int *indices_out, + __local int *loc_histo, const int n) +{ + + int it = get_local_id(0); + int ig = get_global_id(0); + + int gr = get_group_id(0); + int groups = get_num_groups(0); + int items = get_local_size(0); + + int start = ig * (n / groups / items); + int size = n / groups / items; + + // take the histogram in the cache + for (int ir = 0; ir < _RADIX; ir++) + { + loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it]; + } + 
barrier(CLK_LOCAL_MEM_FENCE); + + int newpos, shortkey, k, newpost; + unsigned int key; + + for (int j = 0; j < size; j++) + { +#ifdef TRANSPOSE + k = groups * items * j + ig; +#else + k = j + start; +#endif + float org_value = in_key[k]; + key = *(__global unsigned int *)(in_key + k); + shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); + + newpos = loc_histo[shortkey * items + it]; + +#ifdef TRANSPOSE + int ignew, jnew; + ignew = newpos / (n / groups / items); + jnew = newpos % (n / groups / items); + newpost = jnew * (groups * items) + ignew; +#else + newpost = newpos; +#endif + + // d_outKeys[newpost]= key; // killing line !!! + out_key[newpost] = org_value; + +#ifdef PERMUT + indices_out[newpost] = indices_in[k]; +#endif + + newpos++; + loc_histo[shortkey * items + it] = newpos; + } +} + +// perform a parallel prefix sum (a scan) on the local histograms +// (see Blelloch 1990) each workitem worries about two memories +// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html +__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp, + __global int *globsum) +{ + int it = get_local_id(0); + int ig = get_global_id(0); + int decale = 1; + int n = get_local_size(0) * 2; + int gr = get_group_id(0); + + // load input into local memory + // up sweep phase + temp[2 * it] = histo[2 * ig]; + temp[2 * it + 1] = histo[2 * ig + 1]; + + // parallel prefix sum (algorithm of Blelloch 1990) + for (int d = n >> 1; d > 0; d >>= 1) + { + barrier(CLK_LOCAL_MEM_FENCE); + if (it < d) + { + int ai = decale * (2 * it + 1) - 1; + int bi = decale * (2 * it + 2) - 1; + temp[bi] += temp[ai]; + } + decale *= 2; + } + + // store the last element in the global sum vector + // (maybe used in the next step for constructing the global scan) + // clear the last element + if (it == 0) + { + globsum[gr] = temp[n - 1]; + temp[n - 1] = 0; + } + + // down sweep phase + for (int d = 1; d < n; d *= 2) + { + decale >>= 1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (it < d) + { + int ai = decale * (2 * it + 1) - 1; + int bi = decale * (2 * it + 2) - 1; + + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // write results to device memory + + histo[2 * ig] = temp[2 * it]; + histo[2 * ig + 1] = temp[2 * it + 1]; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +// use the global sum for updating the local histograms +// each work item updates two values +__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum) +{ + int ig = get_global_id(0); + int gr = get_group_id(0); + + int s; + + s = globsum[gr]; + + // write results to device memory + histo[2 * ig] += s; + histo[2 * ig + 1] += s; + + barrier(CLK_GLOBAL_MEM_FENCE); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp new file mode 100644 index 000000000..047004d5e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +constexpr unsigned int vector_size = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, ReductionOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && + op != ReductionOperation::ARG_IDX_MIN, + "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, + DataType::S64); + } + if (prev_output != nullptr && prev_output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, + DataType::S32, DataType::S64); + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output); + } + } + + return Status{}; +} + +std::tuple<Status, Window> 
validate_and_configure_window(ITensorInfo *input, + ITensorInfo *prev_output, + ITensorInfo *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_UNUSED(op); + // Output tensor auto initialization if not yet initialized + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(axis, 1); + DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32; + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), + Steps(vector_size)); + bool window_changed = false; + + switch (axis) + { + case 0: + { + ITensorInfo *input_tensor_access = prev_output != nullptr ? prev_output : input; + AccessWindowStatic input_access(input_tensor_access, 0, 0, + static_cast<int>(input_tensor_access->dimension(0)), 1); + AccessWindowHorizontal output_access(output, 0, 1); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + break; + case 1: + case 2: + case 3: + { + AccessWindowHorizontal input_access(input, 0, vector_size); + AccessWindowHorizontal output_access(output, 0, vector_size); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_tuple(err, win); +} +} // namespace + +CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx() + : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), + _op(ReductionOperation::ARG_IDX_MAX) +{ +} + +void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output, + ICLTensor *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, + output->info(), axis, op)); + auto win_config = validate_and_configure_window( + input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, + op); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + _input = input; + _prev_output = prev_output; + _output = output; + _reduction_axis = axis; + _op = op; + + // Set build options + CLBuildOptions build_opts; + + build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT"); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); + build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); + build_opts.add_option("-DDATA_TYPE_OUTPUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_SELECT=" + + get_cl_signed_type_from_element_size(input->info()->element_size())); + + // Create kernel + cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange(); + std::string kernel_axis_name; + switch (axis) + { + case 0: + { + const ICLTensor *input_for_width = prev_output != nullptr ? 
_prev_output : _input; + build_opts.add_option("-DWIDTH=" + + support::cpp11::to_string(input_for_width->info()->dimension(0))); + + kernel_axis_name = "x"; + lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), + vector_size); + } + break; + case 1: + build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); + kernel_axis_name = "y"; + break; + case 2: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + kernel_axis_name = "z"; + break; + case 3: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3))); + kernel_axis_name = "w"; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); + + // Configure kernel window + ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); +} + +Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr, + output->clone().get(), axis, op))); + return Status{}; +} + +void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + switch (_reduction_axis) + { + case 0: + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + // Reshape window + const unsigned int num_tensors = _prev_output != nullptr ? 
3 : 2; + + // Set local sums buffer + unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size(); + _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + if (_prev_output != nullptr) + { + add_2D_tensor_argument(idx, _prev_output, in_slice); + } + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } + break; + case 1: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), + _input->info()->dimension(1))); + Window in_slice = window_in.first_slice_window_2D(); + Window out_slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_2D(in_slice) && + window.slide_window_slice_2D(out_slice)); + } + break; + case 2: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), + _input->info()->dimension(2))); + Window in_slice = window_in.first_slice_window_3D(); + Window out_slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_3D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_3D(in_slice) && + window.slide_window_slice_3D(out_slice)); + } + break; + case 3: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(3, Window::Dimension(0, 1, 1)); + Window in_slice = window_in.first_slice_window_4D(); + Window out_slice = window.first_slice_window_4D(); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, in_slice); + add_4D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_4D(in_slice) && + window.slide_window_slice_4D(out_slice)); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp new file mode 100644 index 000000000..fbc76f5e1 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, BinaryLogicalOperation op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "binary_logical_op"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()))); + + int op_code = 0; + switch (op) + { + case BinaryLogicalOperation::AND: + op_code = 1; + break; + case BinaryLogicalOperation::OR: + op_code = 2; + break; + default: + throw std::runtime_error("Operation not supported, yet"); + } + + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace( + ("-DVEC_SIZE=" + 
support::cpp11::to_string(num_elems_processed_per_iteration))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const ValidRegion &valid_region = broadcast_pair.second; + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLBinaryLogicalOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp new file mode 100644 index 000000000..6e0bcde7f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "support/StringSupport.h" + +#include <cstddef> +#include <set> +#include <string> + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(input == output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::S16, DataType::U16, DataType::U32, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), + "Input and output data types must be different"); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} +} // namespace + +void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype + // must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + // Get number of elements to process per iterations + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_OUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + const std::string kernel_name = "cast_bool"; + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + + // Configure kernel + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); + + // Collapse window + const Window &full_window = window(); + Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ); + ICLKernel::configure_internal(collapsed_window); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(output->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); +} + +Status CLCastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + + return Status{}; +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..67aaf2db6 --- /dev/null +++ 
b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + return Status{}; +} + +void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp new file mode 100644 index 000000000..3bfe3e407 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/core/UtilsEx.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ + +inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, + ITensorInfo *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + const uint32_t actual_axis = 
wrap_around(axis, static_cast<int>(input->num_dimensions())); + std::unique_ptr<ITensorInfo> output_info = input->clone(); + output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis)); + // Output auto initialization if not yet initialized + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); + + // Create window + Window win = calculate_max_window(*output, Steps()); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + return std::make_pair(Status{}, win); +} + +} // namespace + +CLGatherExKernel::CLGatherExKernel() + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +{ +} + +void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices, + ICLTensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), indices->info(), output->info(), axis)); + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + _input = input; + _output = output; + _indices = indices; + _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions())); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DOUTPUT_DIM_Z=" + + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + ICLKernel::configure_internal(win_config.second); +} + +Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + indices->clone().get(), + output->clone().get(), axis) + .first); + return Status{}; +} + +void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); + add_4D_tensor_argument(idx, _output, window_collapsed); + enqueue(queue, *this, window_collapsed, lws_hint()); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..930e7c944 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLHashtableLookupKernel::CLHashtableLookupKernel() +{ + // DO NOTHING +} + +Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Output's shape was not set"); + + ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) || + output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + + return Status{}; +} + +void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Make _lookup_indices tensor + _lookup_indices = support::cpp14::make_unique<CLTensor>(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t *keys_buf = 
reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); + + enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..61c14d271 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Window.h" +#include "support/StringSupport.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_UNUSED(gamma); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) +{ +} + +void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + _run_in_place = (output == nullptr) || (output == input); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), + gamma ? gamma->info() : nullptr, + beta ? 
beta->info() : nullptr, epsilon)); + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); + build_opts.add_option_if(gamma, "-DGAMMA"); + build_opts.add_option_if(beta, "-DBETA"); + build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); + build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + ICLKernel::configure_internal(std::get<1>(win_config)); +} + +Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + return Status{}; +} + +void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window collapsed_window = window.collapse(window, Window::DimZ); + + // We will process the planes together + if (_input->info()->data_layout() == DataLayout::NCHW) + { + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + } + else + { + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1)); + } + + Window vec_window; + vec_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, collapsed_window); + if (!_run_in_place) + { + add_4D_tensor_argument(idx, _output, collapsed_window); + } + if (_gamma) + { + add_1D_tensor_argument(idx, _gamma, vec_window); + } + if (_beta) + { + add_1D_tensor_argument(idx, _beta, vec_window); + } + + enqueue(queue, *this, collapsed_window, lws_hint()); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp new file mode 100644 index 000000000..6b27c9917 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + + // Checks performed when output is configured + if ((output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, 
DataType::F32); + + // CLMultiplyScaleFactorKernel doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_tuple(Status{}, win); +} +} // namespace + +CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel() + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) +{ +} + +void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor, + ICLTensor *output, float multiplier) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), scale_factor->info(), output->info())); + + _input = input; + _scale_factor = scale_factor; + _output = output; + _multiplier = multiplier; + + const int vec_size_x = 16 / output->info()->element_size(); + const int output_width_x = output->info()->tensor_shape().x(); + const bool multi_access_x = (output_width_x / vec_size_x > 0); + + // Create and update the window (if needed) + Window win = calculate_max_window(*output->info()); + if (multi_access_x) + { + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), + vec_size_x)); + } + ICLKernel::configure_internal(win); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option_if( + multi_access_x, "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); +} + +Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, + const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + return Status{}; +} + +void CLMultiplyScaleFactorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_2D(); + + // Set scale_factor window + Window win_scale = calculate_max_window(*_scale_factor->info(), Steps()); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_1D_tensor_argument(idx, _scale_factor, win_scale); + add_2D_tensor_argument(idx, _output, slice); + _kernel.setArg<float>(idx++, _multiplier); + enqueue(queue, *this, slice, lws_hint()); + } while (window_collapsed.slide_window_slice_2D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..643c8b110 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  // Use the RETURN_* variants so that validation failures are reported through
+  // the returned Status instead of asserting (which is a no-op in release builds)
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  return Status{};
+}
+
+} // namespace
+
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  _input = input;
+  _output = output;
+
+  constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+  // Create kernel
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.emplace(
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+  // Configure window
+  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+  update_window_and_padding(win, input_access, output_access);
+
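+  // neg_tensor is a purely element-wise kernel, so the output's valid region
+  // simply mirrors the input's; the horizontal access windows above only make
+  // sure both tensors are padded to the 16-element processing step.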
output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure_internal(win); +} + +void CLNegKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp new file mode 100644 index 000000000..35d70d689 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "support/StringSupport.h"
+#include <string>
+namespace arm_compute
+{
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                 const ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, on_value, output);
+  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= output->num_dimensions());
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+                                                       DataType::U16, DataType::S16, DataType::F16,
+                                                       DataType::U32, DataType::S32, DataType::F32);
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+      indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+  }
+  return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
+                                                        const ITensorInfo *on_value,
+                                                        ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+  // Output auto initialization if not yet initialized
+  TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+    indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+  auto_init_if_empty((*output), output_shape, 1, on_value->data_type());
+  // Create window
+  Window win = calculate_max_window(*output, Steps());
+  output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+CLOneHotKernel::CLOneHotKernel()
+  : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
+    _is_off_value_memset(false)
+{
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                               const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+  _is_off_value_memset = false;
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, off_value, output);
+  ARM_COMPUTE_ERROR_ON_NULLPTR(off_value->info());
+  ARM_COMPUTE_ERROR_ON(off_value->info()->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+  _off_value = off_value;
+  configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                               ICLTensor *output, int depth, int axis)
+{
+  _is_off_value_memset = true;
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+  configure_common(indices, on_value, output, depth, axis);
+}
+void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor *on_value,
+                                      ICLTensor *output, int depth, int axis)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
+  // Configure kernel window
+  auto win_config =
+    validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  if (_is_off_value_memset)
+  {
+    // Replace the window with one calculated from the indices info
+    win_config.second = calculate_max_window(*indices->info(), Steps());
+  }
+  _indices = indices;
+  _on_value = on_value;
+  _output = output;
+  const auto actual_axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
+  // Set build options
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(
+                                           data_size_from_type(on_value->info()->data_type())));
+  build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis));
+  build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
+  build_opts.add_option("-DOUTPUT_DIM_Z=" +
+                        support::cpp11::to_string(output->info()->dimension(2)));
+  // Create kernel
+  const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot";
+  _kernel = static_cast<cl::Kernel>(
+    CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+  ICLKernel::configure_internal(win_config.second);
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+                                int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(off_value);
+  ARM_COMPUTE_RETURN_ERROR_ON(off_value->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+                                                            on_value->clone().get(),
+                                                            output->clone().get(), depth, axis)
+                                .first);
+  return Status{};
+}
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                const ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+                                                            on_value->clone().get(),
+                                                            output->clone().get(), depth, axis)
+                                .first);
+  return Status{};
+}
+void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  unsigned int idx = 0;
+  add_3D_tensor_argument(idx, _indices, window_collapsed);
+  add_1D_tensor_argument(idx, _on_value, window_collapsed);
+  if (!_is_off_value_memset)
+  {
+    add_1D_tensor_argument(idx, _off_value, window_collapsed);
+  }
+  add_4D_tensor_argument(idx, _output, window_collapsed);
+  enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
new file mode 100644
index 000000000..1a7a18cfa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd.
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, scale_factor); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + // Output must always be initialized + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + const int vec_size_x = 16 / input->element_size(); + const int 
input_width_x = input->tensor_shape().x(); + const bool multi_access_x = (input_width_x / vec_size_x > 0); + + if (multi_access_x) + { + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), + vec_size_x)); + } + + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_pair(Status{}, win); +} +} // namespace + +CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel() + : _input(nullptr), _scale_factor(nullptr), _output(nullptr) +{ +} + +void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor, + ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), scale_factor->info(), output->info())); + + _input = input; + _scale_factor = scale_factor; + _output = output; + + const int vec_size_x = 16 / input->info()->element_size(); + const int input_width_x = input->info()->tensor_shape().x(); + const bool multi_access_x = (input_width_x / vec_size_x > 0); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_OUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option_if( + multi_access_x, "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); +} + +Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, + const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get()).first); + + return Status{}; +} + +void CLQuantizationSymmetricKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Support only 2D + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_2D(); + + do + { + Window scale_slice = slice.shift_dimensions(1); + + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_1D_tensor_argument(idx, _scale_factor, scale_slice); + add_2D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (window_collapsed.slide_window_slice_2D(slice)); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp new file mode 100644 index 000000000..3fbebf25a --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. 
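+// For example, reducing a 4D input of shape [4, 3, 2, 1] along axis 1 gives an
+// inferred output shape of [4, 1, 2, 1]: the reduced dimension is kept as 1 so
+// that input and output can still be addressed through 4D windows.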
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(axis, 1);
+
+  return out_shape;
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                          ReductionOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32, DataType::S32);
+  if (op == ReductionOperation::SUM)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+                                    "QASYMM8 is not supported yet");
+  }
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Output's total size must be non-zero");
+
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than the input's rank");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's total size does not match the shape inferred from axis");
+
+  return Status{};
+}
+} // namespace
+
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const uint32_t axis, ReductionOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Construct kernel name
+  std::string kernel_name;
+  int op_code = 0;
+  if (op == ReductionOperation::MAX)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 1;
+  }
+  else if (op == ReductionOperation::MIN)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 2;
+  }
+  else if (op == ReductionOperation::SUM)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 3;
+  }
+  else if (op == ReductionOperation::MEAN_SUM)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 4;
+  }
+  else
+    throw std::runtime_error("Operation not supported yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const uint32_t axis, ReductionOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+  return Status{};
+}
+
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  _kernel.setArg<cl_int>(idx++, _axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+  // Support dimensions up to 4
+  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+  // Set up the input slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Save the output's shape so it can be restored at the end of this method
+  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
+  // of input and output are the same
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+  idx = 0;
+  add_4D_tensor_argument(idx, _input, slice_in);
+  add_4D_tensor_argument(idx, _output, slice_out);
+  enqueue(queue, *this, slice_out, lws_hint());
+
+  // Restore the output tensor's original shape
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
new file mode 100644
index 000000000..8d8853c81
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "support/StringSupport.h" + +#include <climits> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + TensorShape output_shape = TensorShape{input->dimension(1)}; + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + TensorShape output_shape = TensorShape{input->dimension(1)}; + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, output_shape, 1, input->data_type()); + + const unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowStatic output_access(output, 0, 0, output->dimension(0), 1); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_tuple(err, win); +} +} // namespace + +CLScaleFactorSymm8Kernel::CLScaleFactorSymm8Kernel() : _input(nullptr), _output(nullptr) {} + +void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + std::set<std::string> build_opts; + build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); + + auto win_config = validate_and_configure_window(input->info(), output->info()); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + ICLKernel::configure_internal(std::get<1>(win_config)); +} + +Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + + return Status{}; +} + +void CLScaleFactorSymm8Kernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_2D(); + slice.set(Window::DimX, Window::Dimension(0, 1, 1)); + + do + { + Window output_slice = slice.shift_dimensions(1); + + unsigned int idx = 0; + // Set inputs + add_2D_tensor_argument(idx, _input, slice); + add_1D_tensor_argument(idx, _output, output_slice); + enqueue(queue, *this, slice, lws_hint()); + } while (window_collapsed.slide_window_slice_2D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp new file mode 100644 index 000000000..151d45e8d --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} + +void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + _topk_values = topk_values; + _topk_indices = topk_indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); + + unsigned int idx = 3 * num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *indices); + _kernel.setArg(idx++, *temp_stack); + _kernel.setArg<cl_int>(idx++, k); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 1, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + add_1D_tensor_argument(idx, _topk_values, window); + add_1D_tensor_argument(idx, _topk_indices, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} + +void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, + int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); + ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + 
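+  // (Assumed from the kernel name: topkv2_init is expected to load the input
+  //  values into in_key_buf and seed in_ind_buf with the identity permutation
+  //  0..n-1, which the radix-sort passes below then reorder with the keys.)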
_kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); + + unsigned int idx = num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *in_key_buf); + _kernel.setArg(idx++, *in_ind_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +// This kernel makes a histogram of radix for each work item. +CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} + +void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); + + int loc_histo_size = radix * _ITEMS * sizeof(cl_int); + + unsigned int idx = 1; + _kernel.setArg(idx++, *hist_buf); + + idx = 3; + _kernel.setArg(idx++, loc_histo_size, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg<cl_int>(2, _pass); + + cl::NDRange lws = cl::NDRange(_ITEMS, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} + +void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), 
window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} + +void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, + int bits) +{ + ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *glob_sum_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *temp_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} + +void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortReorder::CLRadixSortReorder() + : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), + _out_ind_buf(nullptr) +{ +} + +void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + unsigned 
int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); + + unsigned int idx = 2; + _kernel.setArg(idx++, *hist_buf); + + idx = 6; + _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); + cl::NDRange lws = (lx < gws_x) ? cl::NDRange(lx, 1) : cl::NDRange(1, 1); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg(1, *_out_key_buf); + _kernel.setArg<cl_int>(3, _pass); + _kernel.setArg(4, *_in_ind_buf); + _kernel.setArg(5, *_out_ind_buf); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} + +void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); + + unsigned int idx = 1; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_out_key_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() + : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts)); + + unsigned int idx = 4; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + 
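+  // The key/index buffers are bound here at run time rather than in
+  // configure(), presumably because they are produced by the preceding sort
+  // passes and are only attached to this kernel right before it is enqueued.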
unsigned int idx = 0; + _kernel.setArg(idx++, *_in_key_buf); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_in_ind_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Store::CLTopKV2Store() + : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(k == 0); + ARM_COMPUTE_ERROR_ON(k > n); + + _values = values; + _indices = indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); + + unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, k, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) +{ + _out_key_buf = out_key_buf; + _out_ind_buf = out_ind_buf; +} + +void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _values, window); + add_1D_tensor_argument(idx, _indices, window); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +} // namespace arm_compute +#endif // Disable GPU implementation diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp new file mode 100644 index 000000000..dfe5d59b0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" + +#include <algorithm> +#include "arm_compute/core/Types.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +namespace +{ + +using namespace arm_compute; +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op_templ( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, + OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, + OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, + non_broadcast_input_ptr, broadcast_value, + output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = + (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? 
a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, + [&](const Coordinates &) { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = + reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = + reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, + input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +} // namespace + +namespace arm_compute +{ + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)) +{ + elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)) +{ + elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func, + broadcast_func, neon_func); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp new file mode 100644 index 000000000..32d7d6237 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace arm_compute +{ + +template <BinaryLogicalOperation op, typename ScalarType> +inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case BinaryLogicalOperation::AND: + res = a & b; + break; + case BinaryLogicalOperation::OR: + res = a | b; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op, typename VectorType> +inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + + switch (op) + { + case BinaryLogicalOperation::AND: + res = wrapper::vand(a, b); + break; + case BinaryLogicalOperation::OR: + res = wrapper::vorr(a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op> +inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) +{ + uint8x16x4_t out = {{ + elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), + }}; + return out; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_logic_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? 
a : broadcast_vector); +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, + ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>, + &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_logic_op_loop<op, ScalarType, VectorType>); +} + +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( + const ITensor *input1, const ITensor *input2, ITensor *output, + std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window) { func(input1, input2, output, window); }; + } + return nullptr; +} + +template <BinaryLogicalOperation op> +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> +configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + + return configure_func(input1, input2, output, map_function); +} + +void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1, + const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); + configure_common(input1, input2, output); + switch (op) + { + case BinaryLogicalOperation::AND: + _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output); + break; + case BinaryLogicalOperation::OR: + _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); 
+ } +} + +Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) +{ + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, + DataType::QASYMM8); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); + return Status{}; +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp new file mode 100644 index 000000000..12017e543 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/SaturateCast.h" + +#include "arm_compute/core/NEON/wrapper/wrapper.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(input == output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::S16, DataType::U16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} +} // namespace + +NECastBoolKernel::NECastBoolKernel() : _input(nullptr), _output(nullptr) {} + +void NECastBoolKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype + // must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + _input = input; + _output = output; + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICPPKernel::configure(win); +} + +Status NECastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + return Status{}; +} + +void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output); + ARM_COMPUTE_ERROR_ON(_input == _output); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win); + Iterator output(_output, win); + + const uint8_t true_val = 1; + const uint8x8_t mask_bool = vdup_n_u8(true_val); + + switch (_output->info()->data_type()) + { + case DataType::S8: + { + /* Conversion U8 -> S8 */ + execute_window_loop(win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const 
uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8( + texels_u8, vdupq_n_u8(true_val)))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::S16: + { + /* Up-conversion U8 -> S16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s16(output_ptr + x, texels.val[0]); + vst1q_s16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::S32: + { + /* Up-conversion U8 -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::F32: + { + /* Up-conversion U8 -> F32 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(output_ptr + x + 12, + vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); + *(output_ptr + x) = static_cast<float>(in); + } + }, + input, output); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case 
DataType::F16: + { + /* Up-conversion U8 -> F16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::U8: + { + /* Conversion U8 -> S8 */ + execute_window_loop(win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::U16: + { + /* Up-conversion U8 -> U16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), + vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; + + vst1q_u16(output_ptr + x, texels.val[0]); + vst1q_u16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..091d38c56 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() + : _input(nullptr), _lookups(nullptr), _output(nullptr) +{ +} + +void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output, + const ITensor *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Auto initialize output if not initialized + auto out_shape = input->info()->tensor_shape(); + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions()); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + const arm_compute::ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + return Status{}; +} + +void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + 
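/*
 * What the window loop below computes, as a minimal scalar sketch; lookups_data,
 * lookups_len and row_len are hypothetical shorthands (lookups_len is
 * lookups->dimension(0), row_len is input->dimension(0)). The real code copies each
 * contiguous row with a single memcpy instead of an inner element loop:
 *
 *   for (size_t i = 0; i < lookups_len; ++i)      // i indexes the last output dimension
 *     for (size_t x = 0; x < row_len; ++x)        // x runs along the contiguous axis
 *       output(x, ..., i) = input(x, ..., lookups_data[i]);
 */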
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const int32_t lookup = *reinterpret_cast<int32_t *>( + _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp new file mode 100644 index 000000000..93963a504 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +namespace arm_compute +{ +namespace +{ +/** Validate the indices + * + * Validate that indices are not negative + * + * @param[in] indices Indices tensor info. + */ +template <typename U> void validate_indices(const ITensor *indices) +{ + for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i) + { + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0); + } +} + +} // namespace + +NEGatherKernelEx::NEGatherKernelEx() + : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} +{ +} + +template <typename U> +inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Iterator output_it(_output, window); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +template <typename U> +void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator output_it(_output, output_window); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank, _axis); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + 
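/*
 * A small worked example of the gather shape rule applied further down via
 * compute_gather_shape_ex(); the sizes here are hypothetical. With an input of shape
 * [2, 3, 4], axis = 1 and indices of shape [5], the output shape becomes [2, 5, 4],
 * and each element is
 *
 *   output(x0, i, x2) = input(x0, indices[i], x2);   // indices[i] selects along axis 1
 */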
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + _input = input; + _indices = indices; + _output = output; + _axis = axis; + _indices_rank = indices->info()->num_dimensions(); + + if (_axis < 0) + { + _axis += input->info()->num_dimensions(); + } + ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions())); + + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_n_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Output auto initialization if not yet initialized + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + + if (axis < 0) + { + axis += input->num_dimensions(); + } + + ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window, info); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp new file mode 100644 index 000000000..30787c0a4 --- /dev/null +++ 
b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <unordered_map> + +using namespace arm_compute; + +namespace +{ +constexpr size_t NOT_HIT = 0xFFFFFFFF; +} // namespace + +NEHashtableLookupKernel::NEHashtableLookupKernel() + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} +{ +} + +void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys, + const ITensor *input, ITensor *output, ITensor *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Auto initialize output if not initialized + auto out_shape{input->info()->tensor_shape()}; + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions(), false); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Auto initialize hits if not initialized + auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1)); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + // Validate in case of configured hits + if (hits->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1)); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + } + + return Status{}; +} + +void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = 
_output->info()->num_dimensions() - 1; + const int const_0 = _output->info()->data_type() == DataType::QASYMM8 + ? _output->info()->quantization_info().uniform().offset + : 0; + + std::unordered_map<int32_t, size_t> key_index_map; + for (size_t n = 0; n < _keys->info()->dimension(0); ++n) + { + const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n})); + key_index_map[key] = n; + } + std::vector<size_t> lookup_indices; + for (size_t k = 0; k < _lookups->info()->dimension(0); ++k) + { + const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k})); + const auto it = key_index_map.find(key); + if (it == key_index_map.end()) + { + lookup_indices.emplace_back(NOT_HIT); + *_hits->ptr_to_element({k}) = 0; + } + else + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= _keys->info()->dimension(0)) + ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices.emplace_back(it->second); + *_hits->ptr_to_element({k}) = 1; + } + } + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..49adf1462 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +template <typename T> +void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window) +{ + /** NEON vector tag type. 
*/ + using ExactTagType = + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + const auto channel_idx = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = 
wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + if (gamma != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), + "Gamma's size must be the same as size of input's channel"); + } + + if (beta != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), + "Beta's size must be the same as size of input's channel"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) +{ +} + +void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output, + ITensor *gamma, ITensor *beta, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? 
input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + + if (_input->info()->data_type() == DataType::F32) + { + _func = &instance_normalization_nchw<float>; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else if (_input->info()->data_type() == DataType::F16) + { + _func = &instance_normalization_nchw<float16_t>; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else + { + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + return Status{}; +} + +void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + (*_func)(_input, _output, _gamma, _beta, _epsilon, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp new file mode 100644 index 000000000..b92130cec --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + // Checks performed when output is configured + if ((output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +inline int32x4x4_t load_value(const int32_t *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) +{ + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t ret = {{ + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + }}; + return ret; +} +} // 
namespace
+
+NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel()
+    : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f)
+{
+}
+
+void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor,
+                                            ITensor *output, float multiplier)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+  _input = input;
+  _scale_factor = scale_factor;
+  _output = output;
+  _multiplier = multiplier;
+
+  // Configure kernel window
+  Window win_config = calculate_max_window(*input->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  INEKernel::configure(win_config);
+}
+
+Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input,
+                                             const ITensorInfo *scale_factor,
+                                             const ITensorInfo *output, float multiplier)
+{
+  ARM_COMPUTE_UNUSED(multiplier);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+
+  return Status{};
+}
+
+template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window)
+{
+  constexpr auto window_step = 16;
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+
+  // Collapse window and reset first dimension to handle tail calculations manually
+  // Supports only 2D input
+  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+  Iterator input(_input, win_collapsed);
+  Iterator output(_output, win_collapsed);
+  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+  execute_window_loop(
+      win_collapsed,
+      [&](const Coordinates &id) {
+        // Each row is dequantized with its own scale factor, scaled once more by
+        // the user-provided multiplier
+        auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()}));
+        scale *= _multiplier;
+
+        const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
+        auto output_ptr = reinterpret_cast<T *>(output.ptr());
+        int x = window_start_x;
+        for (; x <= (window_end_x - window_step); x += window_step)
+        {
+          // store_result<T> (not <float>) so the F16 instantiation converts back
+          // to float16_t before storing
+          store_result<T>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale));
+        }
+        // Compute left-over elements
+        for (; x < window_end_x; ++x)
+        {
+          output_ptr[x] = input_ptr[x] * scale;
+        }
+      },
+      input, output);
+}
+
+void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+  switch (_output->info()->data_type())
+  {
+    case DataType::F32:
+      NEMultiplyScaleFactorKernel::multiply<float>(window);
+      break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    case DataType::F16:
+      NEMultiplyScaleFactorKernel::multiply<float16_t>(window);
+      break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    default:
+      ARM_COMPUTE_ERROR("Unsupported data type.");
+  }
+}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
new file mode 100644
index 000000000..0a11eb509
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+namespace arm_compute
+{
+namespace
+{
+/** Validate the depth
+ *
+ * Validate that the depth value is not negative and that it matches the output
+ * dimension along the one-hot axis
+ *
+ * @param[in] depth  Depth tensor.
+ * @param[in] output Output tensor.
+ * @param[in] axis   Axis of depth.
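+ *
+ * For example (illustrative values, not from the upstream docs): with axis = 0
+ * and an output shape of [4, 2], the depth tensor must hold the value 4, since
+ * the output extent along the one-hot axis has to equal the requested depth.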
+ */ +template <typename U> void validate_depth(const ITensor *depth, const ITensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(depth->buffer())) < 0); + ARM_COMPUTE_ERROR_ON(static_cast<U>(output->info()->tensor_shape()[axis]) != + *(reinterpret_cast<U *>(depth->buffer()))); +} + +Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output); + const int actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(0 > actual_axis || + actual_axis >= static_cast<int>(output->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8, + DataType::U16, DataType::S16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value); + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output); + } + + return Status{}; +} + +template <typename U, typename Enable = void> bool isOnValue(U) { return true; } + +template <typename U, std::enable_if_t<std::is_integral<U>::value, int> = 0> +bool isOnValue(U index, U depth) +{ + return index >= 0 && index < depth; +} +} // namespace + +NEOneHotKernel::NEOneHotKernel() + : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1}, + _output{nullptr}, _func{} +{ +} + +template <typename U> +void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + // Validate that the depth are not negative + validate_depth<U>(_depth, _output, _axis); + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator output_it(_output, output_window); + const U off_value = *reinterpret_cast<U *>(_off_value->buffer()); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + std::fill_n(output_it.ptr(), + _output->info()->dimension(0) * _output->info()->element_size(), off_value); + Coordinates indices_id(id); + indices_id.remove(0); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(0, new_index); + std::copy_n(_on_value->buffer(), _output->info()->element_size(), + _output->ptr_to_element(onehot_id)); + } + }, + output_it); +} + +template <typename U> +inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + // Validate that the indices are not negative + validate_depth<U>(_depth, _output, _axis); + Iterator output_it(_output, window); + execute_window_loop(window, + [&](const Coordinates &id) { + Coordinates indices_id(id); + indices_id.remove(_axis); + const U new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(_axis, 
new_index); + std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer() + : _off_value->buffer(), + _output->info()->element_size(), output_it.ptr()); + } + }, + output_it); +} + +void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth, + const ITensor *on_value, const ITensor *off_value, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output); + ARM_COMPUTE_ERROR_ON(output->info()->total_size() == 0); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(indices->info(), depth->info(), on_value->info(), + off_value->info(), output->info(), axis)); + _indices = indices; + _depth = depth; + _on_value = on_value; + _off_value = off_value; + _output = output; + _axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions())); + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEOneHotKernel::onehot_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEOneHotKernel::onehot_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEOneHotKernel::onehot_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEOneHotKernel::onehot_n_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + INEKernel::configure(win); +} + +Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(indices, depth, on_value, off_value, output, axis)); + return Status{}; +} + +void NEOneHotKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + (this->*_func)(window, info); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp new file mode 100644 index 000000000..5841f1d69 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + return Status{}; +} + +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +inline float32x4_t round(const float32x4_t &fv) +{ + const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f); + const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f); + // If value < 0, mask = -1, else mask = 0 + int32x4_t mask_less_zero_ui32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4)); + return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4)); +} + +inline 
int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale) +{ + const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv); + const int32x4_t vposend = vdupq_n_s32(max_scale); + const int32x4_t vnagend = vdupq_n_s32(-max_scale); + + const int32x4x4_t rf = {{ +#ifdef __aarch64__ + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#else //__aarch64__ + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#endif //__aarch64__ + }}; + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} +} // namespace + +NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) +{ +} + +void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output, + ITensor *scale_factor) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), scale_factor->info())); + + _input = input; + _output = output; + _scale_factor = scale_factor; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor)); + + return Status{}; +} + +template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window; + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + const auto dim_x = _input->info()->dimension(0); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); 
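+        // Per-row symmetric quantization (editor's sketch of the intent, not
+        // upstream documentation): the scale factor is chosen as range / 127 so
+        // that the element with the largest magnitude maps to +/-127, and each
+        // value is quantized as q = clamp(round(x * 127 / range), -127, 127).
+        // For example, a row {-0.4f, 0.2f, 1.0f} has range 1.0f and quantizes
+        // to {-51, 25, 127}. A zero range would divide by zero, so it is
+        // special-cased to a scale factor of 1 below.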
+ if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); +} + +void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::F32: + NEQuantizationSymmetricKernel::quantize<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEQuantizationSymmetricKernel::quantize<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp new file mode 100644 index 000000000..863316909 --- /dev/null +++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+const std::pair<unsigned int, unsigned int>
+arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
+                                             unsigned int kernel_width, unsigned int kernel_height,
+                                             const PadStrideInfo &info, unsigned int invalid_right,
+                                             unsigned int invalid_bottom)
+{
+  const unsigned int stride_x = info.stride().first;
+  const unsigned int stride_y = info.stride().second;
+  const unsigned int padx = info.pad_left() + info.pad_right();
+  const unsigned int pady = info.pad_top() + info.pad_bottom();
+
+  ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
+  ARM_COMPUTE_ERROR_ON(kernel_width <= padx);
+  ARM_COMPUTE_ERROR_ON(kernel_height <= pady);
+
+  // Find the transpose conv out dimensions:
+  //   tconv_out + pad = (in - 1) * stride + kernel + invalid
+  //   tconv_out       = (in - 1) * stride + kernel + invalid - pad
+  // e.g. (illustrative numbers) in = 4, stride = 2, kernel = 3 and
+  // pad = invalid = 0 give tconv_out = (4 - 1) * 2 + 3 = 9
+  const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right;
+  const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom;
+
+  return std::make_pair<unsigned int, unsigned int>(w, h);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
new file mode 100644
index 000000000..158fe0b0c
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/CLFunctionsEx.h"
+
+// NOTE This empty file aims to validate "CLFunctionsEx.h".
+// DO NOT REMOVE this file.
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
new file mode 100644
index 000000000..267228eac
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/Utils.h"
+
+namespace arm_compute
+{
+CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(),
+      _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis()
+{
+}
+
+Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+                                    const ReductionOperation &op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX &&
+                                      op != ReductionOperation::ARG_IDX_MIN,
+                                  "Invalid reduction operation");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+                                  "Reduction axis greater than max number of dimensions");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+  const unsigned int num_of_stages =
+      calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+
+  DataType output_data_type = DataType::S32;
+  TensorInfo not_reshaped_output;
+  const auto input_num_channels = input->num_channels();
+  const auto input_qinfo = input->quantization_info();
+
+  if (output->total_size() != 0)
+  {
+    output_data_type = output->data_type();
+    const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis,
+                                                                   false));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
+  }
+
+  auto shape_before_reshape = input->tensor_shape();
+  shape_before_reshape.set(axis, 1);
+  auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type,
+                                  int num_channels, QuantizationInfo qinfo) {
+    ti.set_data_type(data_type)
+        .set_tensor_shape(shape)
+        .set_num_channels(num_channels)
+        .set_quantization_info(qinfo);
+  };
+
+  initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type,
+                        input_num_channels, input_qinfo);
+
+  if (num_of_stages == 1)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArgMinMaxLayerKernelEx::validate(input, nullptr, &not_reshaped_output, axis, op));
+  }
+  else
+  {
+    // Create temporary tensor infos
+    std::vector<TensorInfo> sums_vector(num_of_stages - 1);
+
+    // Create intermediate tensor info
+    // (each stage reduces the X axis in chunks of 128 elements, hence the
+    // ceil(x / 128) intermediate widths)
+    TensorShape shape{input->tensor_shape()};
+
+    for (unsigned int i = 0; i < num_of_stages - 1; i++)
+    {
+      shape.set(0, ceil(shape.x() / 128.f));
+      sums_vector[i].set_data_type(input->data_type());
+      sums_vector[i].set_tensor_shape(shape);
+      sums_vector[i].set_num_channels(input->num_channels());
+    }
+
+    // Validate ReductionOperation only on first kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op));
+
+    // Validate ReductionOperation on intermediate stages
+    for (unsigned int i = 1; i < num_of_stages - 1; ++i)
+    {
+      ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1],
+                                                                     &sums_vector[i], axis, op));
+    }
+
+    // Validate ReductionOperation on the last stage
+    const unsigned int last_stage = num_of_stages - 1;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
+        input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
+  }
+  ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output));
+  return Status{};
+}
+
+void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output,
+                                   const ReductionOperation &op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+  _reduction_axis = axis;
+
+  const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
+      input->info()->tensor_shape(), axis, false);
+  DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN)
+                                  ?
DataType::S32 + : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + // Configure reduction operation kernels + _reduction_kernels_vector.resize(_num_of_stages); + + _memory_group.manage(&_not_reshaped_output); + // Create temporary tensors + if (_num_of_stages == 1) + { + // Force an early initialization for int64 output type + TensorShape output_shape{input->info()->tensor_shape()}; + output_shape.set(axis, 1); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + _not_reshaped_output.info()->set_tensor_shape(output_shape); + _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); + } + else + { + _results_vector.resize(_num_of_stages - 1); + TensorShape shape{input->info()->tensor_shape()}; + for (unsigned int i = 0; i < _num_of_stages - 1; i++) + { + shape.set(0, ceil(shape.x() / 128.f)); + _results_vector[i].allocator()->init( + input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); + } + + // Apply ReductionOperation only on first kernel + _memory_group.manage(&_results_vector[0]); + _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op); + + // Apply ReductionOperation on intermediate stages + for (unsigned int i = 1; i < _num_of_stages - 1; ++i) + { + _memory_group.manage(&_results_vector[i]); + _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i], + axis, op); + _results_vector[i - 1].allocator()->allocate(); + } + + // Apply ReductionOperation on the last stage + const unsigned int last_stage = _num_of_stages - 1; + _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1], + &_not_reshaped_output, axis, op); + _results_vector[last_stage - 1].allocator()->allocate(); + } + _reshape_kernel.configure(&_not_reshaped_output, output); + _not_reshaped_output.allocator()->allocate(); +} + +void CLArgMinMaxLayerEx::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _num_of_stages; ++i) + { + CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + } + CLScheduler::get().enqueue(_reshape_kernel, false); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp new file mode 100644 index 000000000..e5122ab8f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp new file mode 100644 index 000000000..c7d0ac8e2 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLCastBool.h" + +#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h" + +using namespace arm_compute; + +void CLCastBool::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp new file mode 100644 index 000000000..3dede0562 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <memory> +#include <tuple> + +namespace arm_compute +{ +using namespace arm_compute::misc::shape_calculator; + +CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( + std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _flip_weights(), + _scaled_output(), + _original_weights(nullptr), + _weights_flipped(), + _flip_axis(), + _is_prepared(false) +{ +} + +Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + + auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights); + + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], + "Output's depth is invalid."); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); + 
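+  // Editor's note on the decomposition being validated here (it mirrors the
+  // configure() logic below): the transposed convolution is implemented as an
+  // upsample stage, which spaces the input pixels stride elements apart and
+  // pads the result, followed by an ordinary stride-1 convolution over the
+  // upsampled tensor using the flipped weights.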
ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, weights_info)); + + return Status{}; +} + +void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, + invalid_right, invalid_bottom, weights_info); +} + +void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context, + ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + const DataLayout data_layout = input->info()->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + _original_weights = weights; + _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); + + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + // Output auto initialization if not yet initialized + auto_init_if_empty( + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( + input->info(), weights->info(), bias == nullptr ? 
nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + _is_prepared = weights_info.retain_internal_weights(); + + _memory_group.manage(&_scaled_output); + + // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order + // to match output shape + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + // configure scale function + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + _scale_f.configure(input, &_scaled_output, upsample_info); + + // Setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info, + weights_info); + _scaled_output.allocator()->allocate(); + + // Setup flip axis data + _flip_axis.allocator()->allocate(); + _flip_axis.map(true); + auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer()); + if (weights->info()->data_layout() == DataLayout::NHWC) + { + axis_data[0] = 1; + axis_data[1] = 2; + } + else + { + axis_data[0] = 0; + axis_data[1] = 1; + } + _flip_axis.unmap(); +} + +void CLDirectTransposeConvLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + _scale_f.run(); + _conv_f.run(); +} + +void CLDirectTransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _flip_weights.run(); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + // Free flipped weights + if (!_weights_flipped.is_used()) + { + _weights_flipped.allocator()->free(); + } + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp new file mode 100644 index 000000000..ae9d8afc6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +using namespace arm_compute; + +void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp new file mode 100644 index 000000000..01989461e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" + +#include <algorithm> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + ARM_COMPUTE_UNUSED(input); + ARM_COMPUTE_UNUSED(weights); + ARM_COMPUTE_UNUSED(output); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + + return Status{}; +} +} // namespace + +void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = support::cpp14::make_unique<CLTransposeKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, + const ITensorInfo *output) +{ + return CLTransposeKernel::validate(input, output); +} + +CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), + _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), + _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), + _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), + _original_weights(nullptr) +{ +} +void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, + ICLTensor *output, bool retain_internal_weights) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + ARM_COMPUTE_UNUSED(output); + ARM_COMPUTE_UNUSED(retain_internal_weights); + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); +} + +void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *biases, ICLTensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _accumulate_biases = false; + _is_prepared = fc_info.retain_internal_weights; + _original_weights = weights; + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.set_target(CLScheduler::get().target()); + _accumulate_biases_kernel.configure(output, biases); + } + + const ICLTensor *weights_to_use = weights; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + bool is_fc_after_conv = false; + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1; + } + ARM_COMPUTE_ERROR_ON_MSG(is_fc_after_conv, + "CLFullyConnectedHybridLayer does not support after conv"); + ARM_COMPUTE_UNUSED(is_fc_after_conv); + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_output.allocator()->init( + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); + _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Extract scale factor + _scale_factor.allocator()->init( + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); + _memory_group.manage(&_scale_factor); + _scale_factor_kernel.configure(input, &_scale_factor); + + // Quantize input + _quantized_input.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); + _memory_group.manage(&_quantized_input); + _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); + + // GEMMLowp + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + _memory_group.manage(&_gemmlowp_output); + configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output, + fc_info.retain_internal_weights); + _quantized_input.allocator()->allocate(); + + // Multiply scale + _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output, + weights->info()->quantization_info().uniform().scale); + _gemmlowp_output.allocator()->allocate(); + _scale_factor.allocator()->allocate(); + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; +} + +Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + const GPUTarget gpu_target = CLScheduler::get().target(); + + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->dimension(1) > 1; + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(), + output->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = input->num_dimensions() > 1 && input->dimension(1) > 1; + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_fc_after_conv, + "CLFullyConnectedHybridLayer does not support after conv"); + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + // Validate Scale factor kernel + const ITensorInfo &scale_factor = + TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); + + // Validate quantization symm8 kernel + const ITensorInfo &quantized_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + + // Validate matrix multiply kernel + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); + + // Multiply scale + ARM_COMPUTE_RETURN_ON_ERROR( + CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); + + return Status{}; +} + +void CLFullyConnectedHybridLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Extract scale_factor + CLScheduler::get().enqueue(_scale_factor_kernel); + + // Quantize input + CLScheduler::get().enqueue(_quant_input_kernel); + + // Run matrix multiply + _mm_gemmlowp.run(); + + // Multiply scale factor + 
CLScheduler::get().enqueue(_multiply_scale_kernel); + + // Accumulate biases if provided + if (_accumulate_biases) + { + CLScheduler::get().enqueue(_accumulate_biases_kernel); + } +} + +void CLFullyConnectedHybridLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + auto release_unused = [](CLTensor *w) { + if (!w->is_used()) + { + CLScheduler::get().queue().finish(); + w->allocator()->free(); + } + }; + + // Reshape of the weights if needed (happens only once) + if (!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_kernel.run(); + + _are_weights_reshaped = true; + // We can not release _original_weights because it can be used in other nodes + } + + // Prepare GEMM prepare and release unused weights + _mm_gemmlowp.prepare(); + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp new file mode 100644 index 000000000..2ff4b9659 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
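The hybrid fully connected layer that ends just above quantizes float activations on the fly (CLScaleFactorSymm8Kernel plus CLQuantizationSymmetricKernel), multiplies them against the signed 8-bit weights with CLGEMMLowpMatrixMultiplyCore, and rescales the int32 result with CLMultiplyScaleFactorKernel. The following is only a hedged scalar sketch of that dataflow; the 127-based symmetric scale and the per-row handling are assumptions about kernels whose bodies are not part of this diff.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Hedged scalar sketch of the hybrid path for one activation row against one int8 weight row.
float hybrid_dot(const float *x, const int8_t *w, std::size_t n, float weight_scale)
{
  float max_abs = 0.f;
  for (std::size_t i = 0; i < n; ++i)
    max_abs = std::max(max_abs, std::fabs(x[i]));
  const float input_scale = (max_abs > 0.f) ? max_abs / 127.f : 1.f; // symmetric 8-bit scale

  int32_t acc = 0;
  for (std::size_t i = 0; i < n; ++i)
  {
    const auto q = static_cast<int32_t>(std::lround(x[i] / input_scale)); // quantize activation
    acc += q * static_cast<int32_t>(w[i]);                               // int32 accumulation
  }
  return static_cast<float>(acc) * input_scale * weight_scale; // rescale back to float
}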
+ */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/Cast.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" + +#include <algorithm> + +namespace arm_compute +{ +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::utils::cast; + +namespace +{ +Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, + const ITensorInfo &output, + GEMMLowpOutputStageInfo &gemmlowp_output_stage) +{ + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.gemmlowp_multiplier = 0; + gemmlowp_output_stage.gemmlowp_shift = 0; + + // Configure output stage for quantized case + if (is_data_type_quantized_asymmetric(input.data_type())) + { + const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output.quantization_info().uniform(); + + const auto output_quant_info = (output.total_size() == 0) ? iq_info : oq_info; + + const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( + multiplier, &output_multiplier, &output_shift)); + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage.gemmlowp_shift = output_shift; + gemmlowp_output_stage.gemmlowp_min_bound = 0; + gemmlowp_output_stage.gemmlowp_max_bound = 255; + gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); + gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); + } + + return Status{}; +} + +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, + const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + true, // broadcast_bias + ActivationLayerInfo()); // activation_info + + if (is_data_type_quantized_asymmetric(input.data_type())) + { + const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info(iq_info.scale, -iq_info.offset); + const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); + + // Validate gemmlowp function + 
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); + } + + return Status{}; +} +} // namespace + +void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = support::cpp14::make_unique<CLTransposeKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, + const ITensorInfo *output) +{ + return CLTransposeKernel::validate(input, output); +} + +CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), + _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), + _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), + _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), + _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) +{ +} +void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), + gemmlowp_output_stage); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + true, // broadcast_bias + ActivationLayerInfo()); // activation_info + + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info(QuantizationInfo( + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->info()->set_quantization_info(QuantizationInfo( + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); + + // Revert back QuantizatioInfo as input and weights could be used in other fully connected + // layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info); + } +} + +void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) +{ + 
ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init(input->info() + ->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(shape_flatten) + .set_data_layout(DataLayout::NCHW)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_layer.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, bias, output, fc_info); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void CLFullyConnectedLayerEx::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, bias, output, fc_info); +} + +void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *biases, ICLTensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _is_prepared = fc_info.retain_internal_weights; + _original_weights = weights; + + if (_weights_manager) + { + _weights_manager->manage(weights); + } + + const ICLTensor *weights_to_use = weights; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + if (_weights_manager && _weights_manager->are_weights_managed(weights)) + { + _reshape_weights_managed_function.configure(weights); + weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( + _weights_manager->acquire(weights, &_reshape_weights_managed_function)); + } + else + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + if (_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) + { + _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(), + fc_info.weights_trained_layout); + weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( + _weights_manager->acquire(weights, &_convert_weights_managed)); + } + else + { + // Convert weights + _convert_weights.configure(weights_to_use, &_converted_weights_output, + input->info()->tensor_shape(), fc_info.weights_trained_layout); + + weights_to_use = &_converted_weights_output; + } + _are_weights_converted = false; + } + + if (_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(input, weights_to_use, biases, output, fc_info); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(input, weights_to_use, biases, output, fc_info); + } +} + +Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + + const ITensorInfo &flatten_input = TensorInfo(input->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(input)) + .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = + weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *input_to_use = input; + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->dimension(1) > 1; + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(), + output->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = input->num_dimensions() > 1; + } + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if (is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); + input_to_use = &flatten_input; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + } + + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR( + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); + + return Status{}; +} + +void CLFullyConnectedLayerEx::run() +{ + if (!_is_prepared) + { + if (!_are_weights_reshaped) + _reshape_weights_output.allocator()->allocate(); + if (!_are_weights_converted) + _converted_weights_output.allocator()->allocate(); + _is_prepared = true; + } + + { + if (!_weights_manager) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + } + + // Pointer to current weights + const ICLTensor *cur_weights = _original_weights; + // Reshape of the weights + if (!_are_weights_reshaped) + { + if (_weights_manager && _weights_manager->are_weights_managed(cur_weights)) + { + _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>( + _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + } + else + { + _reshape_weights_function.run(); + cur_weights = &_reshape_weights_output; + } 
+ } + + // Convert weights if needed + if (!_are_weights_converted) + { + if (_weights_manager && _weights_manager->are_weights_managed(cur_weights)) + { + _weights_manager->run(cur_weights, &_convert_weights_managed); + } + else + { + _convert_weights.run(); + } + } + + // Prepare GEMM prepare + if (!_is_quantized) + { + _mm_gemm.prepare(); + } + } + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Linearize input if it comes from a convolutional layer + if (_is_fc_after_conv) + { + _flatten_layer.run(); + } + + // Run matrix multiply + if (_is_quantized) + { + _mm_gemmlowp.run(); + } + else + { + _mm_gemm.run(); + } +} + +void CLFullyConnectedLayerEx::prepare() +{ +#if 0 // TODO Remove this block + if(!_is_prepared) + { + if(!_weights_manager) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + } + + auto release_unused = [](CLTensor * w) + { + if(!w->is_used()) + { + CLScheduler::get().queue().finish(); + w->allocator()->free(); + } + }; + + // Pointer to current weights + const ICLTensor *cur_weights = _original_weights; + + // Reshape of the weights if needed (happens only once) + if(!_are_weights_reshaped) + { + if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) + { + cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + } + else + { + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + + cur_weights->mark_as_unused(); + cur_weights = &_reshape_weights_output; + } + _are_weights_reshaped = true; + } + + // Convert weights if needed (happens only once) + if(!_are_weights_converted) + { + if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) + { + _weights_manager->run(cur_weights, &_convert_weights_managed); + } + else + { + _converted_weights_output.allocator()->allocate(); + _convert_weights.run(); + cur_weights->mark_as_unused(); + } + + _are_weights_converted = true; + } + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + // Prepare GEMM prepare and release unused weights + if(!_is_quantized) + { + _mm_gemm.prepare(); + } + + // Release converted weights if unused + release_unused(&_reshape_weights_output); + release_unused(&_converted_weights_output); + + _is_prepared = true; + } +#endif +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..157b4d977 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
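Both fully connected variants above decide between the "after convolution" and "after fully connected" paths with the same shape test: in the batched case the input's dimensions from index 3 upwards must match the output's dimensions from index 1 upwards. Below is a hedged standalone restatement of that predicate; the helper name is ours, and the non-batched branch follows the CLFullyConnectedLayerEx version (the hybrid variant additionally requires dimension(1) > 1).

#include <algorithm>
#include "arm_compute/core/TensorShape.h"

// Mirrors the std::equal() check used in configure()/validate() above (illustrative helper name).
bool is_fc_after_conv(const arm_compute::TensorShape &input_shape,
                      const arm_compute::TensorShape &output_shape, bool is_batched)
{
  if (is_batched)
  {
    return (arm_compute::TensorShape::num_max_dimensions >= 4) &&
           std::equal(input_shape.cbegin() + 3, input_shape.cend(), output_shape.cbegin() + 1);
  }
  // Non-batched case: any input rank above 1 is treated as coming from a convolution.
  return input_shape.num_dimensions() > 1;
}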
+ */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h" + +#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h> + +using namespace arm_compute; + +void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input, + const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, + arm_compute::ICLTensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape, + KernelType kernel_type) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + const ICLTensor *input_to_use = input; + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_cl_buffer.info(), + _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( + _input->info()->data_layout())); + _cl_reshape.configure(_input, &_cl_buffer); + input_to_use = &_cl_buffer; + } + + _cl_fc = [&]() { + if (kernel_type == KernelType::GENERAL) + { + auto fc = new arm_compute::CLFullyConnectedLayerEx{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS) + { + bool is_hybrid = (input->info()->data_type() == DataType::F32 || + input->info()->data_type() == DataType::F16) && + (weights->info()->data_type() == DataType::S8 || + weights->info()->data_type() == DataType::QASYMM8_SIGNED); + + if (is_hybrid) + { + auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager}; + ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info()); + const auto orgin_weights_data_type = weights_info->data_type(); + weights_info->set_data_type(DataType::QASYMM8_SIGNED); + fc->configure(input_to_use, _weights, _biases, _output); + weights_info->set_data_type(orgin_weights_data_type); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else + { + auto fc = new arm_compute::CLFullyConnectedLayer{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + } + else + { + throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type"); + } + + }(); + + if (_needs_reshape) + { + // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_buffer.allocator()->allocate(); + } +} + +void CLFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _cl_reshape.run(); + + _cl_fc->run(); +} + +void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp new file mode 100644 index 000000000..e0b833b04 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
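CLFullyConnectedReshapingLayer::configure() above picks the backing function from the kernel type and the tensor data types: KernelType::GENERAL always builds CLFullyConnectedLayerEx, while KernelType::PREPROCESSED_WEIGHTS builds the hybrid layer only when float activations meet signed 8-bit weights and otherwise falls back to the stock CLFullyConnectedLayer. A hedged restatement of the hybrid test follows; the helper name is illustrative.

#include "arm_compute/core/Types.h"

bool selects_hybrid_fc(arm_compute::DataType input_dt, arm_compute::DataType weights_dt)
{
  using arm_compute::DataType;
  const bool float_input = (input_dt == DataType::F32 || input_dt == DataType::F16);
  const bool s8_weights = (weights_dt == DataType::S8 || weights_dt == DataType::QASYMM8_SIGNED);
  return float_input && s8_weights;
}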
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLGatherEx.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +using namespace arm_compute; + +void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, + int axis) +{ + auto k = support::cpp14::make_unique<CLGatherExKernel>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return CLGatherExKernel::validate(input, indices, output, axis); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp new file mode 100644 index 000000000..65b89a389 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
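CLGatherEx above only wraps CLGatherExKernel, so typical use is configure-then-run. The sketch below is a hedged usage example: tensor initialization and allocation are abbreviated, the axis value is arbitrary, and run() is assumed to come from the simple-function base class that owns _kernel.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGatherEx.h"

void gather_example(arm_compute::CLTensor &input, arm_compute::CLTensor &indices,
                    arm_compute::CLTensor &output)
{
  // input: data tensor; indices: integer positions along the chosen axis (both already allocated)
  arm_compute::CLGatherEx gather;
  gather.configure(&input, &indices, &output, /* axis = */ 0);
  gather.run(); // enqueues CLGatherExKernel on the CL scheduler
}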
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h" + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +using namespace arm_compute; + +void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + auto k = support::cpp14::make_unique<CLHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..5a7e40839 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} + +void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, float epsilon) +{ + auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + k->configure(input, output, gamma, beta, epsilon); + _kernel = std::move(k); +} + +Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp new file mode 100644 index 000000000..28e5bc0da --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
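CLInstanceNormalizationLayerEx above forwards everything to CLInstanceNormalizationLayerKernelEx, whose body is not part of this diff. For orientation only, instance normalization is conventionally computed per sample and per channel as sketched below; whether the kernel matches this exactly (for instance, where epsilon enters) is an assumption.

#include <cmath>
#include <cstddef>

// Hedged reference formula: y = gamma * (x - mean) / sqrt(var + epsilon) + beta,
// with mean/var taken over the spatial elements of one channel of one sample.
void instance_normalize_channel(const float *x, float *y, std::size_t n, float gamma, float beta,
                                float epsilon)
{
  float mean = 0.f;
  for (std::size_t i = 0; i < n; ++i) mean += x[i];
  mean /= static_cast<float>(n);

  float var = 0.f;
  for (std::size_t i = 0; i < n; ++i) var += (x[i] - mean) * (x[i] - mean);
  var /= static_cast<float>(n);

  const float inv_stddev = 1.f / std::sqrt(var + epsilon);
  for (std::size_t i = 0; i < n; ++i)
    y[i] = gamma * (x[i] - mean) * inv_stddev + beta;
}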
+ */ + +#include "arm_compute/runtime/CL/functions/CLNeg.h" + +#include "arm_compute/core/CL/kernels/CLNegKernel.h" + +using namespace arm_compute; + +void CLNeg::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp new file mode 100644 index 000000000..aa9f32ec6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLOneHot.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" +namespace arm_compute +{ +CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {} +void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, + const ICLTensor *off_value, ICLTensor *output, int depth, int axis) +{ + _onehot_kernel.configure(indices, on_value, off_value, output, depth, axis); +} +void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + PixelValue off_value, int depth, int axis) +{ + _has_to_memset = true; + _memset_kernel.configure(output, off_value); + _onehot_kernel.configure(indices, on_value, output, depth, axis); +} +Status CLOneHot::validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis) +{ + return CLOneHotKernel::validate(indices, on_value, off_value, output, depth, axis); +} +void CLOneHot::run() +{ + if (_has_to_memset) + { + CLScheduler::get().enqueue(_memset_kernel, true); + } + + CLScheduler::get().enqueue(_onehot_kernel, false); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp new file mode 100644 index 000000000..02ee4ad8a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
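CLOneHot above has two configure() overloads: one takes off_value as a tensor and lets the kernel write every element, the other takes off_value as a PixelValue, memsets the output to it first, and then lets the kernel fill only the "on" positions. Element-wise the operation reduces to the comparison sketched below; this is a hedged scalar illustration for the default layout, and the kernel's axis handling is not shown here.

#include <cstddef>
#include <cstdint>

// Hedged scalar sketch: one-hot encode `indices` into a (num_indices x depth) row-major buffer.
void one_hot(const int32_t *indices, std::size_t num_indices, int depth, float on_value,
             float off_value, float *output)
{
  for (std::size_t i = 0; i < num_indices; ++i)
    for (int d = 0; d < depth; ++d)
      output[i * static_cast<std::size_t>(depth) + d] =
          (indices[i] == d) ? on_value : off_value;
}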
+ */ + +#include "arm_compute/runtime/CL/functions/CLReduceOperation.h" + +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() +{ +} + +Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, bool keep_dims, + const ReductionOperation &op) +{ + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); + + ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1); + + // Create temporary tensor infos + auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + auto it = axis.begin(); + for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it) + { + shape.set(*it, 1, false); + interm_tensors[i].set_data_type(input->data_type()); + interm_tensors[i].set_tensor_shape(shape); + interm_tensors[i].set_num_channels(input->num_channels()); + interm_tensors[i].set_data_layout(input->data_layout()); + interm_tensors[i].set_quantization_info(input->quantization_info()); + } + + // Set a vector that is ordered ITensorInfo sequentially. + std::vector<const ITensorInfo *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Validate ReduceOperation only on all kernels + it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + } + + if (!keep_dims) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); + } + + return Status{}; +} + +void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, + const std::set<uint32_t> &axis, bool keep_dims, + ReductionOperation op) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op)); + + _axis = axis; + + _input = input; + _output = output; + _keep_dims = keep_dims; + + // NOTE The axis must have no duplication. + const size_t num_of_kernels = axis.size(); + const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); + + if (num_of_kernels < 1) + { + throw std::runtime_error("CLReduceOperation: there is no axis to reduce"); + } + + _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); + _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); + + // Set a vector that is ordered ICLTensors sequentially. 
+ std::vector<ICLTensor *> tensors; + tensors.emplace_back(input); + for (size_t i = 0; i < num_of_interm_tensors; ++i) + { + tensors.emplace_back(_interm_tensors.get() + i); + } + tensors.emplace_back(output); + + // Apply ReductionOperation on all kernels + TensorShape shape{input->info()->tensor_shape()}; + auto it = axis.begin(); + for (size_t i = 0; i < num_of_kernels; ++i, ++it) + { + shape.set(*it, 1, false); + if (!keep_dims || i != (num_of_kernels - 1)) + { + _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); + _memory_group.manage(&_interm_tensors[i]); + } + _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op); + if (i != 0) + { + _interm_tensors[i - 1].allocator()->allocate(); + } + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output); + _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate(); + } +} + +void CLReduceOperation::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + const size_t num_of_kernels = _axis.size(); + for (size_t i = 0; i < num_of_kernels; ++i) + { + CLScheduler::get().enqueue(_reduce_kernels[i]); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp new file mode 100644 index 000000000..a502f032e --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
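CLReduceOperation above chains one CLReduceOperationKernel per axis through intermediate tensors (each reduced axis collapsed to 1) and, when keep_dims is false, reshapes the last intermediate into the caller's output. The sketch below is a hedged usage example; the SUM reduction and the null memory manager are only example choices, and tensor setup is omitted.

#include <cstdint>
#include <set>
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"

void reduce_example(arm_compute::CLTensor &input, arm_compute::CLTensor &output)
{
  arm_compute::CLReduceOperation reduce(nullptr); // no shared memory manager in this sketch
  const std::set<uint32_t> axis{1, 2};            // reduce over dimensions 1 and 2
  reduce.configure(&input, &output, axis, /* keep_dims = */ false,
                   arm_compute::ReductionOperation::SUM);
  reduce.run();
}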
+ */ +#include "arm_compute/runtime/CL/functions/CLSplitVEx.h" +#include "support/ToolchainSupport.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include <cassert> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ICLTensor *size_splits, const std::vector<ICLTensor *> &outputs, + unsigned int num_splits) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(size_splits->info()->num_dimensions() != 1, + "size_splits must be a 1-D tensor."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_splits != outputs.size(), + "Number of output tensors does not match number of splits."); + return Status{}; +} + +Status validate_slices(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, + uint32_t split_dim) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(split_dim >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2); + + // Start/End coordinates + Coordinates start_coords; + Coordinates end_coords; + for (unsigned int d = 0; d < input->num_dimensions(); ++d) + { + end_coords.set(d, -1); + } + unsigned int axis_offset = 0; + // Validate output tensors + for (const auto &output : outputs) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + // Get output shape + const TensorShape output_shape = output->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0); + + const size_t axis_split_step = output_shape[split_dim]; + + // Output auto inizialitation if not yet initialized + TensorInfo tmp_output_info = *output->clone(); + auto_init_if_empty(tmp_output_info, + input->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + + // Update coordinate on axis + start_coords.set(split_dim, axis_offset); + end_coords.set(split_dim, axis_offset + axis_split_step); + + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords)); + + axis_offset += axis_split_step; + } + + return Status{}; +} + +void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, + std::vector<CLSlice> &_slice_functions, uint32_t split_dim) +{ + unsigned int axis_offset = 0; + // Start/End coordinates + Coordinates start_coords; + Coordinates end_coords; + for (unsigned int d = 0; d < input->info()->num_dimensions(); ++d) + { + end_coords.set(d, -1); + } + int out_iter = 0; + for (const auto &output : outputs) + { + const TensorShape output_shape = output->info()->tensor_shape(); + auto op_size = output_shape.total_size(); + if (!op_size) + { + continue; + } + + assert(op_size != 0); + assert(split_dim <= output_shape.num_dimensions()); + + const size_t axis_split_step = output_shape[split_dim]; + + // Output auto inizialitation if not yet initialized + TensorInfo tmp_output_info = *output->info()->clone(); + auto_init_if_empty( + tmp_output_info, + input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + + // Update coordinate on axis + start_coords.set(split_dim, axis_offset); + end_coords.set(split_dim, axis_offset + axis_split_step); + + // Configure slice function + _slice_functions[out_iter].configure(input, output, start_coords, end_coords); + + // Set valid region from shape + outputs[out_iter++]->info()->set_valid_region(ValidRegion(Coordinates(), 
output_shape)); + axis_offset += axis_split_step; + } +} + +} // namespace + +CLSplitVEx::CLSplitVEx() + : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() +{ +} + +void CLSplitVEx::configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim, + const std::vector<ICLTensor *> &outputs, unsigned int num_splits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, size_splits); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(size_splits, outputs, num_splits)); + + _input = input; + _size_splits = size_splits; + _outputs = outputs; + _num_splits = num_splits; + + // Create tensor slices + _slice_functions.resize(_num_splits); + + // Extract output tensor info + std::vector<ITensorInfo *> outputs_info; + for (auto &output : _outputs) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + outputs_info.emplace_back(output->info()); + } + + // Validate slices + ARM_COMPUTE_ERROR_THROW_ON(validate_slices(_input->info(), outputs_info, split_dim)); + + // Configure slices + configure_slices(_input, _outputs, _slice_functions, split_dim); +} + +void CLSplitVEx::run() +{ + // execute the slices + for (unsigned i = 0; i < _outputs.size(); ++i) + { + _slice_functions[i].run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp new file mode 100644 index 000000000..3ac95a8e6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "../../topk_v2.h" + +namespace arm_compute +{ + +CLTopKV2::CLTopKV2() + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), + _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ +{ +} + +void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits, int bits) +{ + _total_bits = total_bits; + _bits = bits; + _n = input->info()->tensor_shape()[0]; + + // _total_bits should be divided by _bits. + ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0); + + _k = k; + _radix = 1 << bits; + + _input = input; + _values = values; + _indices = indices; + + std::string topk_env; + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n); + } + else if (topk_env == "GPU") + { + // n should be divided by (_GROUPS * _ITEMS) + ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0); + + _hist_buf_size = _radix * _GROUPS * _ITEMS; + _glob_sum_buf_size = _HISTOSPLIT; + + _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _hist_buf_size); + _glob_sum_buf = + cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int)); + _in_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _out_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _in_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _out_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _p_in_key_buf = &_in_key_buf; + _p_out_key_buf = &_out_key_buf; + _p_in_ind_buf = &_in_ind_buf; + _p_out_ind_buf = &_out_ind_buf; + + _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n); + _hist_kernel.configure(&_hist_buf, bits, _n); + _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits); + 
_paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _reorder_kernel.configure(&_hist_buf, bits, _n); + _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n); + _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n); + _store_kernel.configure(values, indices, k, _n); + } + else +#endif // Disable GPU implementation + { + // DO NOTHING for CPU. + } +} + +void CLTopKV2::run() +{ + std::string topk_env; +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + run_on_gpu_single_quicksort(); + } + else if (topk_env == "GPU") + { + run_on_gpu(); + } + else +#endif + { + run_on_cpu(); + } +} + +#if 0 +void CLTopKV2::run_on_gpu_single_quicksort() +{ + // This is a single threaded quick sort implementation. + CLScheduler::get().enqueue(_qs_kernel, false); + + arm_compute::CLScheduler::get().sync(); +} + +void CLTopKV2::run_on_gpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + // 1. CLTopKV2Init set key buffer and index buffer. + // - Key buffer is set as the same value of the layer's input + // - Values in the index buffer are set as their indices. + CLScheduler::get().enqueue(_init_kernel, false); + + int n_passes = _total_bits / _bits; + + // 2. Repeat (total_bits/bits) times. + // - total_bits is the number of bits of the data type (e.g., 32 for float) + // - bits defines number of buckets (e.g. 16 buckets where bit is 4) + for (int pass = 0; pass < n_passes; ++pass) + { + arm_compute::CLScheduler::get().sync(); + + // 2.1. Calculate histogram with _GROUPS * _ITEMS threads + _hist_kernel.setPass(pass, _p_in_key_buf); + CLScheduler::get().enqueue(_hist_kernel, false); + + // 2.2. Calculate prefix sum locally with multiple threads + CLScheduler::get().enqueue(_scan_hist_kernel, false); + // 2.3. Calculate prefix sum within a work group + CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); + // 2.4. Calculate global prefix sum + CLScheduler::get().enqueue(_paste_hist_kernel, false); + + // 2.5. Reorder keys and indices based on the global prefix sum + _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_kernel, false); + + cl::Buffer *tmp; + // swap key buffers + tmp = _p_in_key_buf; + _p_in_key_buf = _p_out_key_buf; + _p_out_key_buf = tmp; + + // swap index buffers + tmp = _p_in_ind_buf; + _p_in_ind_buf = _p_out_ind_buf; + _p_out_ind_buf = tmp; + } + + // 3. Get the first negative index + // Because we swap in_buf and out_buf at the end of the above for loop, + // the output buffers are in bufs. + _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); + CLScheduler::get().enqueue(_find_first_negative_kernel, false); + + // 4. Correct odering of negatives + // - Since radix sort does not consider negatives, negatives are considered as bigger values + // than positives. + // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf + _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, + _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_negatives_kernel, false); + + // 5. Extract top k values from sorted keys and indices. + _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_store_kernel, false); + + arm_compute::CLScheduler::get().sync(); + +#if 0 + // below code is left for debugging. 
+ int first_neg; + q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); + std::cout << "first neg = " << first_neg << std::endl; + + float in_key[_n]; + q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; + } + + float out_key[_n]; + q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; + } + + int in_ind[_n]; + q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; + } + + int out_ind[_n]; + q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; + } + + int hist_buf[_hist_buf_size]; + q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); + for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { + std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; + } + + int glob_sum_buf[_glob_sum_buf_size]; + q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); + for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { + std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; + } + +#endif +} +#endif // Disable GPU implementation + +void CLTopKV2::run_on_cpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + // const Window& w = _topkv2_kernel.window(); + + _input->map(q); + _values->map(q); + _indices->map(q); + + // int row_size = (w[0].end() - w[0].start()) / w[0].step(); + int row_size = _input->info()->tensor_shape()[0]; + int rank = _input->info()->num_dimensions(); + + if (rank > 2) + throw std::runtime_error("Not supported type."); + + int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1); + + if (_input->info()->data_type() == DataType::F32) + { + nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k, + (int32 *)_indices->buffer(), (float *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::S32) + { + nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (int32_t *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (uint8_t *)_values->buffer()); + } + else + { + throw std::runtime_error("Not supported type."); + } + + _input->unmap(q); + _values->unmap(q); + _indices->unmap(q); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp new file mode 100644 index 000000000..3215d01a7 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <cmath> +#include <memory> +#include <tuple> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_manager(std::move(memory_manager)), _function() +{ +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, + invalid_right, invalid_bottom, weights_info); +} + +void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, + ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, + output->info(), deconv_info, invalid_right, + invalid_bottom, weights_info)) + { + case DeconvolutionMethod::DIRECT: + { + auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>(); + f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, + invalid_bottom, weights_info); + _function = std::move(f); + break; + } + case DeconvolutionMethod::GEMM: + { + auto f = 
arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + f->configure(compile_context, input, weights, bias, output, deconv_info); + _function = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } +} + +Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + switch (CLTransposeConvLayer::get_deconvolution_method( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + { + case DeconvolutionMethod::DIRECT: + { + // Validate direct convolution layer + ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + break; + } + case DeconvolutionMethod::GEMM: + { + // Validate gemm-based convolution layer + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + return Status{}; +} + +DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_UNUSED(output, bias, weights_info); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 || + invalid_bottom != 0) + { + return DeconvolutionMethod::DIRECT; + } + + return DeconvolutionMethod::GEMM; +} + +void CLTransposeConvLayer::run() +{ + prepare(); + _function->run(); +} + +void CLTransposeConvLayer::prepare() { _function->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp new file mode 100644 index 000000000..80fbf359d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/NEON/NEFunctionsEx.h" + +// NOTE This empty file aims to validate "NEFunctionsEx.h". +// DO NOT REMOVE this file. 
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp new file mode 100644 index 000000000..2fc94b267 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" +#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> + +#include "arm_compute/core/ITensor.h" +#include "support/MemorySupport.h" + +#include <utility> + +namespace arm_compute +{ + +template <BinaryLogicalOperation COP> +void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, + ITensor *output) +{ + auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); +} + +template <BinaryLogicalOperation COP> +Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output); +} + +void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) +{ + auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); +} + +Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op) +{ + return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output); +} + +// Supported Specializations +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp new file mode 100644 index 000000000..6ad3e1b12 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NECastBool.h" + +#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" +#include "support/MemorySupport.h" + +using namespace arm_compute; + +void NECastBool::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status NECastBool::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return NECastBoolKernel::validate(input, output); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp new file mode 100644 index 000000000..e0ab3e025 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +#include "support/MemorySupport.h" + +using namespace arm_compute; + +void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) +{ + auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp new file mode 100644 index 000000000..a123439d9 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + + return Status{}; +} +} // namespace + +void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) +{ + auto k = support::cpp14::make_unique<NETransposeKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, + const ITensorInfo *output) +{ + return NETransposeKernel::validate(input, output); +} + +NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) +{ +} + +void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); +} + +void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _accumulate_biases = false; + _original_weights = weights; + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + bool _is_fc_after_conv; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1; + } + ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv, + "NEFullyConnectedHybridLayer does not support after conv"); + (void)_is_fc_after_conv; + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_output.allocator()->init( + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); + _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Quantize input + _quantized_input.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); + _scale_factor.allocator()->init( + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); + + // GEMM + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output); + + // Multiply scale + _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output, + weights->info()->quantization_info().uniform().scale); + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; + + _quantized_input.allocator()->allocate(); + _scale_factor.allocator()->allocate(); + _gemmlowp_output.allocator()->allocate(); +} + +Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *weights_to_use = weights; + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + + // Validate quantization kernel + const ITensorInfo &quantized_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); + const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( + &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); + + return Status{}; +} + +void NEFullyConnectedHybridLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Quantize input + NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY); + + // Run matrix multiply + _mm_gemmlowp.run(); + + // Multiply scale factor + NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY); + + // Accumulate biases if provided + if (_accumulate_biases) + { + NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); + } +} + +void NEFullyConnectedHybridLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + auto release_unused = [](Tensor *w) { + if (!w->is_used()) + { + w->allocator()->free(); + } + }; + + // Reshape of the weights (happens only once) + if (!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + + _are_weights_reshaped = true; + // We can not release _original_weights because it can be used in other nodes + } + + // Prepare GEMM prepare and release unused weights + _mm_gemmlowp.prepare(); + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp 
b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp new file mode 100644 index 000000000..cb7557a5a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + if (is_data_type_quantized_asymmetric(input.data_type())) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info(input.quantization_info().uniform().scale, + -input.quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights.quantization_info().uniform().scale, + -weights.quantization_info().uniform().offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( + &input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + } + + return Status{}; +} +} // namespace + +NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), + _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), + _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), + _accumulate_biases(false), _is_quantized(false), _is_prepared(false) +{ +} + +void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info(QuantizationInfo( + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->info()->set_quantization_info(QuantizationInfo( + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); + + // Revert back QuantizatioInfo as input and weights could be used in other fully connected + // layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + } +} + +void 
NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + shape_flatten)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_kernel.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, output); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, output); +} + +void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _original_weights = weights; + + // Configure gemmlowp output + if (_is_quantized) + { + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::S32)); + } + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !_is_quantized) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights.configure(weights_to_use, &_converted_weights_output, + input->info()->tensor_shape(), fc_info.weights_trained_layout); + + weights_to_use = &_converted_weights_output; + _are_weights_converted = false; + } + + ITensor *tmp_output = (_is_quantized) ? 
&_gemmlowp_output : output; + if (_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(input, weights_to_use, tmp_output); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(input, weights_to_use, tmp_output); + } + + // Configure output stage for asymmetric quantized types + if (_is_quantized) + { + float multiplier = input->info()->quantization_info().uniform().scale * + weights->info()->quantization_info().uniform().scale / + output->info()->quantization_info().uniform().scale; + int output_multiplier; + int output_shift; + quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, + &output_shift); + _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, + output_shift, + output->info()->quantization_info().uniform().offset); + _gemmlowp_output.allocator()->allocate(); + } + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; +} + +Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); + + const ITensorInfo &flatten_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = + weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *input_to_use = input; + const ITensorInfo *weights_to_use = weights; + const ITensorInfo *tmp_output = (is_quantized) ? 
&gemmlowp_output : output;
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+  if (is_batched_fc_layer)
+  {
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->num_dimensions() > 1;
+  }
+
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Validate convert weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
+      weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+    weights_to_use = &converted_weights;
+  }
+
+  if (is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(
+      (weights_to_use->dimension(1) !=
+       (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+    // Validate flatten kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+    input_to_use = &flatten_input;
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+  }
+  // Validate matrix multiply kernel
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
+
+  // Validate output stage for asymmetric quantized types
+  if (is_quantized)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
+      &gemmlowp_output, biases, output));
+  }
+
+  return Status{};
+}
+
+void NEFullyConnectedLayerEx::run()
+{
+  if (!_is_prepared)
+  {
+    if (!_are_weights_reshaped)
+      _reshape_weights_output.allocator()->allocate();
+    if (!_are_weights_converted)
+      _converted_weights_output.allocator()->allocate();
+    _is_prepared = true;
+  }
+
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Reshape the weights if needed
+    if (!_are_weights_reshaped)
+    {
+      _reshape_weights_function.run();
+    }
+
+    // Convert weights if needed
+    if (!_are_weights_converted)
+    {
+      _convert_weights.run();
+    }
+
+    // Prepare GEMM
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+  }
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Linearize input if it comes from a convolutional layer
+  if (_is_fc_after_conv)
+  {
+    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+  }
+
+  // Run matrix multiply
+  if (_is_quantized)
+  {
+    _mm_gemmlowp.run();
+  }
+  else
+  {
+    _mm_gemm.run();
+  }
+
+  // Accumulate biases if provided
+  if (_is_quantized)
+  {
+    _gemmlowp_output_stage.run();
+  }
+  else
+  {
+    if (_accumulate_biases)
+    {
+      NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+    }
+  }
+}
+
+void NEFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    auto release_unused = [](Tensor *w) {
+      if (!w->is_used())
+      {
+        w->allocator()->free();
+      }
+    };
+
+    // Pointer to current weights
+    const ITensor *cur_weights = _original_weights;
+
+    // Reshape of the weights (happens only once)
+    if (!_are_weights_reshaped)
+    {
+      // Run reshape weights kernel and mark weights as unused
+      _reshape_weights_output.allocator()->allocate();
+      _reshape_weights_function.run();
+
+      cur_weights->mark_as_unused();
+      cur_weights = &_reshape_weights_output;
+      _are_weights_reshaped = true;
+    }
+
+    // Convert weights if needed (happens only once)
+    if (!_are_weights_converted)
+    {
+      _converted_weights_output.allocator()->allocate();
+      _convert_weights.run();
+
+      cur_weights->mark_as_unused();
+      _are_weights_converted = true;
+    }
+
+    // Release reshaped weights if unused
+    release_unused(&_reshape_weights_output);
+
+    // Prepare GEMM and release unused weights
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+
+    // Release reshaped and converted weights if unused
+    release_unused(&_reshape_weights_output);
+    release_unused(&_converted_weights_output);
+
+    _is_prepared = true;
+  }
+#endif
+}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..dc6c78478
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input,
+                                               const arm_compute::ITensor *weights,
+                                               const arm_compute::ITensor *biases,
+                                               arm_compute::ITensor *output, bool needs_reshape,
+                                               const arm_compute::TensorShape &reshape,
+                                               KernelType kernel_type)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  const ITensor *input_to_use = input;
+  if (_needs_reshape)
+  {
+    // Reshape the input tensor to the requested shape
+    auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+    _neon_reshape.configure(_input, &_neon_buffer);
+    input_to_use = &_neon_buffer;
+  }
+
+  _neon_fc = [&]() {
+    if (kernel_type == KernelType::GENERAL)
+    {
+      auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager};
+      fc->configure(input_to_use, _weights, _biases, _output);
+      return std::unique_ptr<arm_compute::IFunction>(fc);
+    }
+    else
+    {
+      assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
+
+      bool is_hybrid = input->info()->data_type() == DataType::F32 &&
+                       (weights->info()->data_type() == DataType::S8 ||
+                        weights->info()->data_type() == DataType::QASYMM8_SIGNED);
+
+      if (is_hybrid)
+      {
+        auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+        ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+        const auto origin_weights_data_type = weights_info->data_type();
+        weights_info->set_data_type(DataType::QASYMM8_SIGNED);
+        fc->configure(input_to_use, _weights, _biases, _output);
+        weights_info->set_data_type(origin_weights_data_type);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+      else
+      {
+        auto fc = new arm_compute::NEFullyConnectedLayer{_memory_manager};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return std::unique_ptr<arm_compute::IFunction>(fc);
+      }
+    }
+  }();
+
+  // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+  if (_needs_reshape)
+  {
+    _neon_buffer.allocator()->allocate();
+  }
+}
+
+void NEFullyConnectedReshapingLayer::run(void)
+{
+  if (_needs_reshape)
+    _neon_reshape.run();
+
+  _neon_fc->run();
+}
+
+void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 000000000..433c35d58
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEGatherEx.h" + +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" +#include "support/MemorySupport.h" + +#include <utility> + +namespace arm_compute +{ +void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) +{ + auto k = support::cpp14::make_unique<NEGatherKernelEx>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return NEGatherKernelEx::validate(input, indices, output, axis); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp new file mode 100644 index 000000000..52d58accf --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" + +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" +#include "support/MemorySupport.h" + +using namespace arm_compute; + +void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, + ITensor *output, ITensor *hits) +{ + auto k = support::cpp14::make_unique<NEHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} + +Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..16d74e62d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() +{ +} + +void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma, + ITensor *beta, float epsilon) +{ + const DataLayout data_layout = input->info()->data_layout(); + + // Configure Kernels + _is_nchw = data_layout == DataLayout::NCHW; + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + + _permute_output.configure(&_permuted_output, output != nullptr ? output : input, + PermutationVector(2U, 0U, 1U)); + _permuted_input.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + _normalization_kernel.configure(input, output, gamma, beta, epsilon); + } +} + +Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return NEInstanceNormalizationLayerKernelEx::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); +} + +void NEInstanceNormalizationLayerEx::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp new file mode 100644 index 000000000..275c55024 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NEOneHot.h" +#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" +#include "support/MemorySupport.h" +#include <utility> +namespace arm_compute +{ +void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>(); + k->configure(indices, depth, on_value, off_value, output, axis); + _kernel = std::move(k); +} +Status NEOneHot::validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + return NEOneHotKernel::validate(indices, depth, on_value, off_value, output, axis); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp new file mode 100644 index 000000000..cb1a26304 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/Tensor.h" + +using namespace arm_compute; + +NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReductionOperation op) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output, ReductionOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + 
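// One reduction stage is configured per requested axis; when keep_dims is
+ // true, the final stage writes directly into the output tensor, so one
+ // fewer intermediate tensor is needed (hence the smaller _reduced_outs).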
_reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], op); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceOperation::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp new file mode 100644 index 000000000..26a887912 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEReduceSum.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 
1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(input->info()->data_layout())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], + ReductionOperation::SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceSum::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp new file mode 100644 index 000000000..aa165cc15 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ + +NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), + _scaled_output(), + _weights_flipped(), + _flip_axis(), + _original_weights(nullptr), + _input(nullptr), + _info(), + _is_prepared(false) +{ +} + +Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + + auto out_dims = transposeconv_output_dimensions( + input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + } + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + const TensorShape 
output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); + } + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != + scale_out_info.dimension(batches_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != + scale_out_info.dimension(channel_idx)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, + conv_info, WeightsInfo())); + + return Status{}; +} + +void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) +{ + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), (bias == nullptr) ? 
nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + const DataLayout data_layout = input->info()->data_layout(); + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); + _memory_group.manage(&_scaled_output); + + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); + + // setup the function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + + // Setup flip axis data + _flip_axis.allocator()->allocate(); + auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer()); + axis_data[0] = static_cast<uint32_t>(width_idx); + axis_data[1] = static_cast<uint32_t>(height_idx); + + _scaled_output.allocator()->allocate(); +} + +void NETransposeConvLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + _upsample_f.run(); + _conv_f.run(); +} + +void NETransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _flip_weights.run(); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h new file mode 100644 index 000000000..f94effea1 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/topk_v2.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. 
All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file topk_v2.h
+ * @brief This file contains TopK method and TopContainer class for TopK operation
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
+
+typedef int32_t int32;
+
+namespace nnfw
+{
+namespace rt
+{
+namespace optimized_ops
+{
+/**
+ * @brief class to define TopK operation
+ * @note The following code is implemented and modified while referring to the TFLite
+ * topk_v2.cc file.
+ * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, and TENSOR_INT32,
+ * which differs from TFLite.
+ * (TFLite additionally supports kTfLiteInt64.)
+ *
+ * The class that collects top indexes of k values. Based on the template
+ * tensorflow::gtl::TopN<>, but re-uses the same container for optimization.
+ */
+template <typename T> class TopContainer
+{
+public:
+  /**
+   * @brief Prevent default constructor of this class
+   */
+  TopContainer() = delete;
+  /**
+   * @brief Constructor with params
+   * @param [in] row_size Size of row in data
+   * @param [in] k The top k predictions
+   */
+  TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
+  {
+    container_.reserve(std::min(k, row_size) + 1);
+  }
+
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * @param [in] topContainer To copy
+   */
+  TopContainer(const TopContainer &) = delete;
+  /**
+   * @brief Prevent instances of this class from being copied (As this class contains pointers)
+   * @param [in] topContainer To copy
+   * @return Reference of TopContainer
+   */
+  TopContainer &operator=(const TopContainer &) = delete;
+
+  /**
+   * @brief Start collecting
+   * @param [in] values To set as values
+   * @return N/A
+   */
+  void start_collecting(const T *values)
+  {
+    values_ = values;
+    container_.clear();
+  }
+
+  /**
+   * @brief Push a value to be compared for topk
+   * @param [in] a A value to compare
+   * @return N/A
+   */
+  void push(int32 a)
+  {
+    auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+    if (container_.size() <= (size_t)k_)
+    {
+      container_.push_back(a);
+      if (container_.size() == (size_t)(k_ + 1))
+      {
+        std::make_heap(container_.begin(), container_.end(), comparator);
+        std::pop_heap(container_.begin(), container_.end(), comparator);
+      }
+    }
+    else if (comparator(a, container_.front()))
+    {
+      container_.back() = a;
+      std::push_heap(container_.begin(), container_.end(), comparator);
+      std::pop_heap(container_.begin(), container_.end(), comparator);
+    }
+  }
+
+  /**
+   * @brief Get sorted result from pushed values
+   * @return Reference of vector with sorted values
+   */
+  const std::vector<int32> &sorted_result()
+  {
+    auto comparator = [this](int32 a, int32 b) { return compare_fun(a, b); };
+    if (container_.size() <= (size_t)(k_))
+    {
+      std::sort(container_.begin(), container_.end(), comparator);
+    }
+    else
+    {
+      std::sort_heap(container_.begin(), container_.end() - 1, comparator);
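+      // While collecting, the vector holds k_ + 1 entries: a k_-element heap
+      // whose front is the current worst of the top-k, plus the most recently
+      // evicted candidate parked in the back slot. sort_heap above orders the
+      // k_ heap entries best-first; the resize below drops the parked extra.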
+      container_.resize(k_);
+    }
+    return container_;
+  }
+
+private:
+  int32 k_;
+  std::vector<int32> container_;
+  const T *values_ = nullptr;
+
+  bool compare_fun(int32 a, int32 b) const
+  {
+    if (values_[b] < values_[a])
+    {
+      return true;
+    }
+    else if (values_[b] > values_[a])
+    {
+      return false;
+    }
+    else
+    {
+      return a < b;
+    }
+  }
+};
+
+/**
+ * @brief Perform the TopK operation with the given params
+ * @param [in] row_size Size of row in data
+ * @param [in] num_rows The number of rows in data
+ * @param [in] data Input data to operate on
+ * @param [in] k The top k predictions
+ * @param [out] output_indexes Indexes of targets in the top k predictions
+ * @param [out] output_values Values of targets in the top k predictions
+ * @return N/A
+ */
+template <typename T>
+void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
+          T *output_values)
+{
+  TopContainer<T> topc(k, row_size);
+  for (int row = 0; row < num_rows; ++row)
+  {
+    const T *values_row = data + row * row_size;
+    topc.start_collecting(values_row);
+    for (int32 c = 0; c < row_size; ++c)
+    {
+      topc.push(c);
+    }
+
+    // Prepare output buffers.
+    int32 *indexes_row = output_indexes + row * k;
+    T *output_row = output_values + row * k;
+    // We always assume that the output is sorted.
+    const auto &top_k = topc.sorted_result();
+    std::copy(top_k.begin(), top_k.end(), indexes_row);
+    std::transform(top_k.begin(), top_k.end(), output_row,
+                   [values_row](const int32 loc) { return values_row[loc]; });
+  }
+}
+
+} // namespace optimized_ops
+} // namespace rt
+} // namespace nnfw
+
+#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
new file mode 100644
index 000000000..5ea6cdadd
--- /dev/null
+++ b/compute/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectories()
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
new file mode 100644
index 000000000..09f67259c
--- /dev/null
+++ b/compute/cker/CMakeLists.txt
@@ -0,0 +1,19 @@
+nnfw_find_package(Eigen REQUIRED)
+nnfw_find_package(GEMMLowp REQUIRED)
+nnfw_find_package(Ruy REQUIRED)
+
+add_library(nnfw_lib_cker INTERFACE)
+target_link_libraries(nnfw_lib_cker INTERFACE eigen)
+target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp)
+target_link_libraries(nnfw_lib_cker INTERFACE ruy)
+target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation)
+target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV)
+if(PROFILE_RUY)
+  target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler)
+endif(PROFILE_RUY)
+
+target_include_directories(nnfw_lib_cker INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+# Workaround to avoid warning
+# TODO Resolve warning
+target_compile_options(nnfw_lib_cker INTERFACE -Wno-attributes)
diff --git a/compute/cker/README.md b/compute/cker/README.md
new file mode 100644
index 000000000..3d98362ab
--- /dev/null
+++ b/compute/cker/README.md
@@ -0,0 +1,7 @@
+# cker
+
+cker - CPU kernel library
+
+__cker__ means `CPU kernel`
+
+Current __cker__ is a port of TensorFlow Lite's operation kernels
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h
new file mode 100644
index 000000000..e08040632
--- /dev/null
+++ b/compute/cker/include/cker/NeonTensorUtils.h
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_NEON_TENSOR_UTILS_H__ +#define __NNFW_CKER_NEON_TENSOR_UTILS_H__ + +#include <ruy/path.h> +#include <ruy/ruy.h> +#include "cker/Types.h" +#include "cker/neon/neon_check.h" +#include "cker/ruy/RuySupport.h" +#include "util/logging.h" +#if defined __linux__ && defined __aarch64__ +#include <sys/auxv.h> +#endif + +#include <cassert> +#include <cmath> + +#ifdef USE_NEON + +#define kFloatWeightsPerNeonLane 4 + +namespace nnfw +{ +namespace cker +{ + +namespace +{ + +constexpr int kFloatValuesPerNeonVector = 4; + +// TODO(ahentz): Clean up. +using int8 = std::int8_t; +using uint8 = std::uint8_t; +using int16 = std::int16_t; +using uint16 = std::uint16_t; +using int32 = std::int32_t; +using uint32 = std::uint32_t; + +template <int PerNeonSize> inline int RoundDownVectors(int size) +{ + return size & ~(PerNeonSize - 1); +} + +// Allocates, at least, size bytes of uninitialized storage whose alignment is +// specified by alignment. The size parameter must be an integral multiple of +// alignment. +// Caller is responsible by freeing the allocated memory by calling free on +// the passed freeing_buffer pointer. +void *aligned_alloc(size_t alignment, size_t size, void **freeing_buffer) +{ + *freeing_buffer = malloc(size + alignment); + const size_t offset = ((uintptr_t)*freeing_buffer) % alignment; // NOLINT + return offset == 0 ? *freeing_buffer : ((char *)*freeing_buffer + (alignment - offset)); // NOLINT +} + +inline int32_t AccumulateNeonLane(const int32x4_t lane) +{ +#ifdef __aarch64__ + return vaddvq_s32(lane); +#else + int64x2_t pairwiseAdded = vpaddlq_s32(lane); + return vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1); +#endif +} + +} // namespace + +// The implementation of dotprod detection is copied from ruy's internal +// function DetectDotprod(). +// At the moment it's only implemented on Linux ARM64. Consider syncing again +// with ruy in the future to share improvements. +#if defined __linux__ && defined __aarch64__ +inline bool DetectDotprodByLinuxAuxvMethod() +{ + // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers, + // however we need to support building against older headers for the time + // being. + const int kLocalHwcapAsimddp = 1 << 20; + return getauxval(AT_HWCAP) & kLocalHwcapAsimddp; +} +#endif + +inline bool DetectArmNeonDotprod() +{ +#if defined __linux__ && defined __aarch64__ + return DetectDotprodByLinuxAuxvMethod(); +#endif + + return false; +} + +inline bool HasSdotInstruction() +{ + static const bool has_dotprod = DetectArmNeonDotprod(); + return has_dotprod; +} + +#ifdef __aarch64__ +// We interleave vector data to make the dot product logic more efficient. +// Suppose that vectors is: +// a0 a1 a2 a3 a4 a5 ... +// b0 b1 b2 b3 b4 b5 ... +// c0 c1 c2 c3 c4 c5 ... +// d0 d1 d2 d3 d4 d5 ... +// e0 e1 e2 e3 e4 e5 ... +// This code interleaves them like this: +// a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a4 a5 a6 a7 b4 ... 
+// e0 e1 e2 e3 f0 f1 f2 f3 ... +// Once the data is interleaved, each 16-byte read from the vectors pointer +// contains 4 bytes from each of 4 vectors. +inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols, + void **shuffled_vectors_free) +{ + const int kWeightsPerUint32 = 4; + + int8 *shuffled_vectors = reinterpret_cast<int8 *>( + aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); + + for (int i = 0; i < n_batch; i += 4) + { + int8 *shuffled_vectors_ptr = shuffled_vectors + (i * m_cols); + const int8 *unshuffled_vec0_ptr = reinterpret_cast<const int8 *>(vectors) + (i * m_cols); + const int8 *unshuffled_vec1_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 1) * m_cols); + const int8 *unshuffled_vec2_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 2) * m_cols); + const int8 *unshuffled_vec3_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 3) * m_cols); + const int8 *const end_vec0_ptr = unshuffled_vec1_ptr; + + while (unshuffled_vec0_ptr != end_vec0_ptr) + { + asm volatile( + // This code path requires that (n_cols % 16) == 0 so we can safely + // read in 16-byte chunks from each row. + "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" + "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" + "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" + "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" + + "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" + + : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr), + [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr), + [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr), + [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr), + [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr) + : + : "v0", "v1", "v2", "v3", "cc", "memory"); + } + } + + return reinterpret_cast<const int8_t *>(shuffled_vectors); +} + +// Notes about the speed of this version vs. the baseline (from memory): +// - With 256K of L1, we can keep a lot of vectors in cache. +// I recall a reasonable speedup just by rearranging the loop to have +// row on the outside and batch on the inside. +// - I also recall getting a nice speedup from sdot. +// - I tried many times to do better than the current implementation, using +// loop unrolling and instruction reordering to avoid stalls, etc. +// but I was not able to do significantly better. This code is, however, +// much worse than what the processor spec sheet suggests is possible. 
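+// In short: result[batch * m_rows + row] += scaling_factors[batch] *
+// dot(row of matrix, batch input vector), with rows processed two at a time
+// and batches four at a time; callers must ensure m_cols % 16 == 0 to match
+// the 16-byte loads in the inner loop (see the asm comment below).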
+static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *vectors, + const float *scaling_factors, + int n_batch, float *__restrict__ result) +{ + void *shuffled_vectors_free; + + const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); + + for (int row = 0; row < m_rows; row += 2) + { + for (int batch = 0; batch < n_batch; batch += 4) + { + float *result_ptr = result + (batch * m_rows) + row; + const int8 *mat_ptr0 = matrix + (row * m_cols); + const int8 *mat_ptr1 = matrix + ((row + 1) * m_cols); + const int8 *mat_ptr0_end = mat_ptr1; + const int8 *vec_ptr = shuffled_vectors + (batch * m_cols); + const float *scaling_factors_ptr = scaling_factors + batch; + const uint64_t wide_rows = m_rows * sizeof(float); + const int8 *mat_ptr2 = matrix + ((row + 2) * m_cols); + const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols); + + asm volatile( + // Zero out the accumulator registers. + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + + "1:\n" // batch_cols_loop + + // Read 16 more bytes from a pair of matrix rows. + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + + // Prefetch two rows ahead. + "prfm pldl1strm, [%[mat_ptr2]]\n" + "prfm pldl1strm, [%[mat_ptr3]]\n" + + // Read from input vectors 4 times; 64 bytes total. + // Each 16-byte register contains parts of 4 vectors; see the + // shuffle logic above. + + // From Benoit, places to look in the future: + // - Move load instructions further from sdot + // - Switch loop use-then-reload + // - Do partial unrolling to use register space better + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + + // Update prefetch pointers. + "add %[mat_ptr2], %[mat_ptr2], #16\n" + "add %[mat_ptr3], %[mat_ptr3], #16\n" + + // Re-use those vectors for the next row as well. + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + + // If we're not done with these rows, continue. + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 1b\n" // batch_cols_loop + + // Done with the rows, sum the results. + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + // Convert the per-vector sums to floating point. + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Fetch scale factors. + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + + // Multiply scale factors times sums. + "fmul v0.4s, v4.4s, v0.4s\n" + "fmul v1.4s, v4.4s, v1.4s\n" + + // Load previous result values. + // The result position is: + // result[batch * m_rows + row] + // Here that is factored into: + // result_ptr = result + row + // *result_ptr = res[0] + // (uint8*)result_ptr += (m_rows * sizeof(float)) + // *result_ptr = res[1] + // ... 
+ // Since we're reading two rows at a time, though, we read both + // result[batch * m_rows + row] + // and + // result[batch * m_rows + row + 1] + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + + // Go back to the starting position (subtract wide_rows * 4). + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + + // Add previous result values. + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + + // Store results. + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), + [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3) + : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr), + [wide_rows] "r"(wide_rows) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "cc", "memory"); + } + } + + free(shuffled_vectors_free); +} + +static void DotprodMatrixBatchFourVectorMultiplyAccumulate( + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) +{ + void *shuffled_vectors_free; + const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); + + for (int row = 0; row < m_rows; row += 2) + { + const float *channel_scales_ptr = per_channel_scale + row; + int32_t *row_sums_ptr = row_sums ? row_sums + row : nullptr; + for (int batch = 0; batch < n_batch; batch += 4) + { + float *result_ptr = result + (batch * m_rows) + row; + const int8 *mat_ptr0 = matrix + (row * m_cols); + const int8 *mat_ptr1 = matrix + ((row + 1) * m_cols); + const int8 *mat_ptr0_end = mat_ptr1; + const int8 *vec_ptr = shuffled_vectors + (batch * m_cols); + const float *scaling_factors_ptr = scaling_factors + batch; + const uint64_t wide_rows = m_rows * sizeof(float); + const int32_t *batch_offsets_ptr = input_offset + batch; + const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; + const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; + asm volatile("dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + // Load zero points. + "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + // Zero out zero point accumulators. + "dup v14.4s, wzr\n" + "dup v15.4s, wzr\n" + + // Load per channel scales if not null. 
+ "cmp %w[is_channel_scale_nullptr], #0\n" + "bne 1f\n" + "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" + "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "b 2f\n" + "1:\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "2:\n" + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 3f\n" + // Accumulate row_sums for zero point calculations. + "saddlp v12.8h, v12.16b\n" + "saddlp v13.8h, v13.16b\n" + "sadalp v14.4s, v12.8h\n" + "sadalp v15.4s, v13.8h\n" + "3:\n" + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 2b\n" + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 4f\n" + // Calculate zero point offsets. + "addv s14, v14.4s\n" + "addv s15, v15.4s\n" + "dup v14.4s, v14.s[0]\n" + "dup v15.4s, v15.s[0]\n" + "b 5f\n" + "4:\n" + "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" + "ld1r {v15.4s}, [%[row_sums_ptr]]\n" + "5:\n" + + "mul v14.4s, v14.4s, v7.4s\n" + "mul v15.4s, v15.4s, v7.4s\n" + "sub v0.4s, v0.4s, v14.4s\n" + "sub v2.4s, v2.4s, v15.4s\n" + + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Multiply scale. + "fmul v0.4s, v16.4s, v0.4s\n" + "fmul v1.4s, v17.4s, v1.4s\n" + + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), + [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr) + : [mat_ptr0_end] "r"(mat_ptr0_end), + [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows), + [channel_scales_ptr] "r"(channel_scales_ptr), + [batch_offsets_ptr] "r"(batch_offsets_ptr), + [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr), + [is_row_sums_nullptr] "r"(is_row_sums_nullptr) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); + } + } + + free(shuffled_vectors_free); +} + +// The DotprodMatrixBatchFourVectorMultiplyAccumulate kernel processes 4 +// vectors in the same time as the baseline processes 1 vector. However, it +// requires 4 vectors of input. +// +// To take advantage of this speed difference, we add some zero-valued +// vectors to the batch so that n_batch is a multiple of 4. 
Then we execute +// DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate on that padded batch, +// then extract just the results we want at the end (ignoring the extra padding +// outputs). +// +// The relative cost of the padding is large when the matrix is smaller than +// 128x128, so we don't use this code path on small matrices. On larger +// matrices, the computation cost dwarfs the padding cost, making this code +// viable. +// +// If we ignore the cost of padding, this kernel is: +// 1x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 1 +// 2x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 2 +// 3x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 3 +// ... +// +// We don't use this kernel when n_batch = 1 because the baseline kernel +// is fine for that case. +inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) +{ + const int kWeightsPerUint32 = 4; + + // Round to the nearest multiple of 4. + int batch_round_up = n_batch; + if (n_batch % 4 != 0) + { + batch_round_up += (4 - n_batch % 4); + } + assert(n_batch <= batch_round_up); + + void *padded_vectors_free; + const int padded_vectors_size = batch_round_up * m_cols; + int8_t *padded_vectors = reinterpret_cast<int8_t *>( + aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); + memset(padded_vectors, 0, padded_vectors_size); + + void *padded_result_free; + const int result_size = n_batch * m_rows * sizeof(float); + const int padded_result_size = batch_round_up * m_rows * sizeof(float); + float *padded_result = reinterpret_cast<float *>( + aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); + memcpy(padded_result, result, result_size); + memset(reinterpret_cast<char *>(padded_result) + result_size, 0, + padded_result_size - result_size); + + // Copy the input into the padded data structure. + assert(n_batch * m_cols <= padded_vectors_size); + memcpy(padded_vectors, vectors, n_batch * m_cols); + + void *padded_scaling_factors_free; + const int padded_scaling_factors_size = batch_round_up * sizeof(float); + float *padded_scaling_factors = reinterpret_cast<float *>( + aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); + assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size); + assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size); + memset(padded_scaling_factors, 0, batch_round_up * sizeof(float)); + memcpy(padded_scaling_factors, scaling_factors, n_batch * sizeof(float)); + + if (input_offset != nullptr) + { + void *padded_input_offset_free; + const int padded_input_offset_size = batch_round_up * sizeof(int32_t); + int32_t *padded_input_offset = reinterpret_cast<int32_t *>( + aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); + assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size); + assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size); + memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t)); + memcpy(padded_input_offset, input_offset, n_batch * sizeof(int32_t)); + + // Call the main kernel. 
+ DotprodMatrixBatchFourVectorMultiplyAccumulate( + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, + padded_result, per_channel_scale, padded_input_offset, row_sums); + + free(padded_input_offset_free); + } + else + { + // Call the main kernel. + DotprodMatrixBatchFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, padded_vectors, + padded_scaling_factors, batch_round_up, + padded_result); + } + memcpy(result, padded_result, result_size); + + free(padded_result_free); + free(padded_vectors_free); + free(padded_scaling_factors_free); +} + +inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result) +{ + DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); +} +#endif // __aarch64__ + +inline void NeonCwiseClipping(float *vector, const int v_size, const float clipping_value) +{ + const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value); + const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value); + + int i = 0; + for (; i <= v_size - kFloatValuesPerNeonVector; i += kFloatValuesPerNeonVector) + { + // Load from memory to vector. + float32x4_t v_f32x4 = vld1q_f32(vector + i); + // Clip between clipping_value and -clipping_value. + v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4); + v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4); + // Save to output. + vst1q_f32(vector + i, v_f32x4); + } + for (; i < v_size; i++) + { + vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value); + } +} + +inline bool NeonIsZeroVector(const float *vector, int v_size) +{ + // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot + // use the main vectorized loop, and we need to process sequentially. + // postamble_start shows the start index where this should happen. 
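+ // For example, with v_size = 10 and kFloatWeightsPerNeonLane = 4,
+ // postamble_start = 10 - (10 & 3) = 8: elements 0..7 go through the
+ // vectorized loop and elements 8..9 through the scalar loop below.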
+ const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + const float32x4_t zero_x4_float = vmovq_n_f32(0.0f); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) + { + const float32x4_t i_x4_float = vld1q_f32(vector + v); + uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float); + if (vgetq_lane_u32(cmp_result, 0) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 1) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 2) == 0) + return false; + if (vgetq_lane_u32(cmp_result, 3) == 0) + return false; + } + + // Postamble loop + for (int v = postamble_start; v < v_size; ++v) + { + if (vector[v] != 0.0) + return false; + } + return true; +} + +inline void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, + const int8_t *input_to_gate_weights, int32_t n_batch, + int32_t n_input, int32_t n_output, int32_t, int32_t *scratch, + ruy::Context *ruy_context) +{ + MatrixParams<int8_t> lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.rows = n_output; + lhs_params.cols = n_input; + lhs_params.cache_policy = CachePolicy::kAlwaysCache; + + MatrixParams<int8_t> rhs_params; + rhs_params.order = Order::kColMajor; + rhs_params.rows = n_input; + rhs_params.cols = n_batch; + + MatrixParams<int32_t> dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = n_output; + dst_params.cols = n_batch; + + GemmParams<int32_t, int32_t> gemm_params; + if (bias) + { + gemm_params.bias = bias; + } + + // Below code is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy + ruy::Matrix<int8_t> ruy_lhs; + ruy::Matrix<int8_t> ruy_rhs; + ruy::Matrix<int32_t> ruy_dst; + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs, true); + ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst); + + ruy::BasicSpec<int32_t, int32_t> ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); + + ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); +} + +inline void NeonSub1Vector(const float *vector, int v_size, float *result) +{ + // If v_size is not divisible by the vector size, then we need to process the + // final few elements sequentially. postamble_start shows the start index + // where this should happen. + const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size); + + float32x4_t one_f32x4 = vmovq_n_f32(1.0); + int v = 0; + for (; v < postamble_start; v += kFloatValuesPerNeonVector) + { + // Load 4 float values from the current pointers of the input column and + // subtract from 1. + float32x4_t v_f32x4 = vld1q_f32(vector + v); + float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4); + // Save to output. + vst1q_f32(result + v, result_f32x4); + } + for (; v < v_size; v++) + { + result[v] = 1.0f - vector[v]; + } +} + +inline void NeonSymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *min, float *max, + float *scaling_factor) +{ + // TODO(raziel): vectorize min/max calculation. 
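+ // Symmetric quantization maps floats in [-range, range] to int8 values in
+ // [-127, 127], with range = max(|min|, |max|):
+ //   scaling_factor = range / 127
+ //   q = clamp(round(value / scaling_factor), -127, 127)
+ // For example, with range = 2.54 the scaling factor is 0.02, and the value
+ // 1.27 quantizes to round(1.27 * 50) = 64 (63.5 rounds away from zero).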
+ auto minmax = std::minmax_element(values, values + size);
+ *min = *minmax.first;
+ *max = *minmax.second;
+ const int kScale = 127;
+ const float range = std::max(std::abs(*min), std::abs(*max));
+ if (range == 0)
+ {
+ memset(quantized_values, 0, size * sizeof(int8_t));
+ *scaling_factor = 1;
+ return;
+ }
+ *scaling_factor = range / kScale;
+ const float scaling_factor_inv = kScale / range;
+
+ const int postamble_start = size - (size & (2 * kFloatWeightsPerNeonLane - 1));
+
+ // Vectorized constants.
+ const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
+ const float32x4_t point5_f32x4 = vmovq_n_f32(0.5);
+ const float32x4_t zero_f32x4 = vmovq_n_f32(0.0);
+ const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
+ const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
+
+ for (int i = 0; i < postamble_start; i += 2 * kFloatWeightsPerNeonLane)
+ {
+ // Implements the vectorized version of the following:
+ // const int32_t quantized_value = static_cast<int32>(
+ // std::round(scaling_factor_inv * values[i]));
+ // Since the vectorized round intrinsic (vrndaq_f32) is not supported
+ // on all Neon flavors, we use the following method for rounding:
+ // if (x < 0) (int)(x - 0.5); if (x >= 0) (int)(x + 0.5)
+ float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
+ float32x4_t value1_f32x4 = vld1q_f32(&values[i + kFloatWeightsPerNeonLane]);
+ float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
+ float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
+
+ int32x4_t cmp_with_zero0_ui32x4 = (int32x4_t)vcltq_f32(mul0_f32x4, zero_f32x4); // NOLINT
+ int32x4_t cmp_with_zero1_ui32x4 = (int32x4_t)vcltq_f32(mul1_f32x4, zero_f32x4); // NOLINT
+
+ float32x4_t cmp_with_zero0_f32x4 = vcvtq_f32_s32(cmp_with_zero0_ui32x4);
+ float32x4_t cmp_with_zero1_f32x4 = vcvtq_f32_s32(cmp_with_zero1_ui32x4);
+ cmp_with_zero0_f32x4 = vaddq_f32(cmp_with_zero0_f32x4, point5_f32x4);
+ cmp_with_zero1_f32x4 = vaddq_f32(cmp_with_zero1_f32x4, point5_f32x4);
+
+ mul0_f32x4 = vaddq_f32(mul0_f32x4, cmp_with_zero0_f32x4);
+ mul1_f32x4 = vaddq_f32(mul1_f32x4, cmp_with_zero1_f32x4);
+
+ int32x4_t f2i0_i32x4 = vcvtq_s32_f32(mul0_f32x4);
+ int32x4_t f2i1_i32x4 = vcvtq_s32_f32(mul1_f32x4);
+
+ // Implements the vectorized version of the following block:
+ // quantized_values[i] = std::min(kScale, std::max(-kScale,
+ // quantized_value));
+ int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
+ int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
+ int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
+ int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
+
+ int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
+ int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
+
+ int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
+ int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
+ vst1_s8(&quantized_values[i], min_s8x8);
+ }
+
+ for (int i = postamble_start; i < size; ++i)
+ {
+ const int32_t quantized_value =
+ static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
+ quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+ }
+}
+
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ float *__restrict__ result, int result_stride)
+{
+#ifdef __aarch64__
+ if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 && m_rows >= n_batch)
+ {
+ if (n_batch % 4 == 0 && result_stride == 1)
+ {
+ // Benchmarks suggest
that it's always better to use the batch code + // when we can, even on small matrices. + DotprodMatrixBatchFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, + scaling_factors, n_batch, result); + return; + } + else if (result_stride == 1 && n_batch >= 2 && m_rows * m_cols >= 128 * 128) + { + DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, + scaling_factors, n_batch, result); + return; + } + } +#endif // __aarch64__ + + static const int kWeightsPerUint32 = 4; + static const int kWeightsPerNeonLane = 16; + // Assuming *matrix is kWeightsPerUint32-byte aligned, + // every row of the matrix is also + // kWeightsPerUint32-byte aligned as long as cols is + // a multiple of kWeightsPerUint32. The assumption + // is currently satisfied by TFLite's 16-byte memory + // alignment scheme. + // + // Otherwise, we allocate an aligned memory block and set + // a flag to later copy rows from matrix to the block + // for aligned multiplication. + bool unaligned = false; + int8_t *aligned_row = nullptr; + void *aligned_row_free = nullptr; + if ((m_cols & (kWeightsPerUint32 - 1)) != 0) + { + unaligned = true; + aligned_row = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT + &aligned_row_free); + } + void *aligned_vec_free = nullptr; + int8_t *aligned_vec = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT + &aligned_vec_free); + + // If m_cols is not at least kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_half_start + // shows the start index where this should happen. Between postamble_start and + // postamble_half_start we can still process kWeightsPerNeonLane >> 1 in a + // vectorized form. + const int postamble_half_start = m_cols & ~(kWeightsPerNeonLane - 1); + const int postamble_start = m_cols & ~((kWeightsPerNeonLane >> 1) - 1); + + for (int batch = 0; batch < n_batch; ++batch) + { + const float batch_scaling_factor = scaling_factors[batch]; + // Copy the vector data to an aligned vector. + memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols); + // Compute dot-product for every column. + for (int row = 0; row < m_rows; ++row, result += result_stride) + { + // Get the address of the first element of the row. + int8_t *row_ptr = (int8_t *)matrix + row * m_cols; // NOLINT + if (unaligned) + { + memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols); + row_ptr = aligned_row; + } + + // Initialize the dot product sum for the row to 0. + int32x4_t dotprod_32x4 = vmovq_n_s32(0); + + // Prefetch the row to cache. + __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */); + + // For every block of 16 8-bit elements. + int col = 0; + for (; col < postamble_half_start; col += kWeightsPerNeonLane) + { + // Load 16 8-bit values from the row and vector, each, to operate on. + // Here the assumption is that each buffer is 4-byte aligned. Otherwise, + // performance may suffer significantly. + assert( // NOLINT + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col)); + const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col)); + // Multiply the low bits (i.e. the lower 8 8bit numbers in the + // registers). + int16x8_t prod_16x8 = vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16)); + // Multiply the high bits (i.e. the higher 8 8bit numbers in the + // registers), and accumulate with the result of the low bits product. 
+ // The assumption here is that overflow will not happen as we quantize
+ // our values to be in the range [-127, 127]. As such the sum of the 2
+ // products always fits in 15 bits (strictly below 32767 in absolute
+ // value).
+ prod_16x8 = vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
+
+ dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
+ } // for col
+
+ // Half iteration dealing with only 8 elements
+ // TODO(raziel): if (ABSL_PREDICT_FALSE(col < postamble_start))
+ if (col < postamble_start)
+ {
+ // Load 8 8-bit values from the row and the vector, each, to operate on.
+ // Here the assumption is that each buffer is 4-byte aligned.
+ // Otherwise, performance may suffer significantly.
+ assert( // NOLINT
+ ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+ const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col));
+ const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col));
+ const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
+ dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
+ col += (kWeightsPerNeonLane >> 1);
+ }
+ // Add the 4 intermediate sum values to get the final dot-prod value for
+ // this row.
+ int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
+ // Postamble loop.
+ // TODO(raziel): if (ABSL_PREDICT_FALSE(col < m_cols))
+ for (; col < m_cols; ++col)
+ {
+ dotprod += row_ptr[col] * aligned_vec[col];
+ } // for col
+
+ *result += dotprod * batch_scaling_factor;
+ } // for row
+ } // for batch
+
+ if (unaligned)
+ {
+ free(aligned_row_free);
+ }
+ free(aligned_vec_free);
+}
+
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch, float *result,
+ int result_stride)
+{
+ // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+ // main vectorized loop, and we need to process sequentially. postamble_start
+ // shows the start index where this should happen.
+ const int postamble_start = m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
+
+ for (int b = 0; b < n_batch; b++)
+ {
+ float *result_in_batch = result + b * m_rows * result_stride;
+ const float *vector_in_batch = vector + b * m_cols;
+ const float *matrix_row = matrix;
+
+ // Main matrix-by-vector multiplication loop
+ for (int r = 0; r < m_rows; r++)
+ {
+ float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+ for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane)
+ {
+ // Load 4 float values from vector and matrix row.
+ float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c);
+ float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c);
+ // Multiply the vector and matrix row and add to accumulator.
+ acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
+ }
+ // Add the 4 intermediate sum values to get the final dot-prod value for
+ // this row.
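+ // (On AArch64 the four lane extractions below could be folded into a single
+ // horizontal add, vaddvq_f32(acc_32x4); the explicit form also builds on
+ // 32-bit ARM.)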
+ *result_in_batch += (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) + + vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3)); + for (int c = postamble_start; c < m_cols; c++) + { + *result_in_batch += matrix_row[c] * vector_in_batch[c]; + } + matrix_row += m_cols; + result_in_batch += result_stride; + } + } +} + +inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + int32_t *scratch, float *__restrict__ result, + int result_stride, ruy::Context *ruy_context) +{ + if (m_rows % 4 == 0 && result_stride == 1) + { + const int32_t *bias = static_cast<const int32_t *>(nullptr); + NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows, + /*output_zp =*/0, scratch, ruy_context); + + // Multiply by float scaling factors and write to result + const int total_size = n_batch * m_rows; + int i = 0; + for (; i <= total_size - 8; i += 8, result += 8 * result_stride) + { + const float batch_scaling_factor0 = scaling_factors[i / m_rows]; + const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; + const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); + const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + const int32x4_t scratch_val0 = vld1q_s32(scratch + i); + const int32x4_t scratch_val1 = vld1q_s32(scratch + i + 4); + const float32x4_t float_val0 = vcvtq_f32_s32(scratch_val0); + const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1); + const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0); + const float32x4_t result1 = + vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); + vst1q_f32(result, result0); + vst1q_f32(result + 4 * result_stride, result1); + } + scratch += i; + for (; i < total_size; i++, result += result_stride) + { + const float batch_scaling_factor = scaling_factors[i / m_rows]; + int32_t x = *(scratch++); + *result += x * batch_scaling_factor; + } + return; + } + NeonMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, + result, result_stride); +} + +} // namespace cker +} // namespace nnfw + +#endif // USE_NEON + +#endif // __NNFW_CKER_NEON_TENSOR_UTILS_H__ diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h new file mode 100644 index 000000000..3b3b27f72 --- /dev/null +++ b/compute/cker/include/cker/PortableTensorUtils.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__ +#define __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__ + +#include "cker/Types.h" +#include "cker/neon/neon_check.h" +#include <ruy/context.h> + +#include <cstring> +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +class ActivationFunctor +{ +public: + explicit ActivationFunctor(FusedActivationFunctionType act) : act_(act) {} + + float operator()(float a) const + { + switch (act_) + { + case FusedActivationFunctionType::kNone: + return a; + case FusedActivationFunctionType::kRelu: + return a < 0.f ? 0.f : a; + case FusedActivationFunctionType::kRelu6: + return std::max(0.f, std::min(a, 6.f)); + case FusedActivationFunctionType::kTanh: + return std::tanh(a); + case FusedActivationFunctionType::kSigmoid: + return 1.0f / (1.0f + std::exp(-a)); + default: + // TODO(aselle): More informative fatal error! + exit(1); + } + } + +private: + FusedActivationFunctionType act_; +}; + +template <typename T> +void PortableCwiseClipping(T *vector, const int v_size, const T clipping_value) +{ + for (int i = 0; i < v_size; i++) + { + vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value)); + } +} + +inline void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch, + float *batch_vector) +{ + for (int b = 0; b < n_batch; b++) + { + memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float)); + } +} + +inline void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch, + float *batch_vector) +{ + for (int b = 0; b < n_batch; b++) + { + for (int i = 0; i < v_size; ++i) + { + batch_vector[i] += vector[i]; + } + batch_vector += v_size; + } +} + +inline bool PortableIsZeroVector(const float *vector, int v_size) +{ + for (int i = 0; i < v_size; ++i) + { + if (*vector++ != 0.0f) + return false; + } + return true; +} + +inline void PortableApplyActivationToVector(const float *vector, int v_size, + FusedActivationFunctionType activation, float *result) +{ + auto activation_func = ActivationFunctor(activation); + for (int v = 0; v < v_size; v++) + { + *result++ = (activation_func)(*vector++); + } +} + +inline void PortableSub1Vector(const float *vector, int v_size, float *result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ = 1.0f - *vector++; + } +} + +inline void PortableSymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *min_value, + float *max_value, float *scaling_factor) +{ + auto minmax = std::minmax_element(values, values + size); + *min_value = *minmax.first; + *max_value = *minmax.second; + const int kScale = 127; + const float range = std::max(std::abs(*min_value), std::abs(*max_value)); + if (range == 0) + { + memset(quantized_values, 0, size * sizeof(int8_t)); + *scaling_factor = 1; + return; + } + *scaling_factor = range / kScale; + const float scaling_factor_inv = kScale / range; + for (int i = 0; i < size; ++i) + { + const int32_t quantized_value = + static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); + // Clamp: just in case some odd numeric offset. 
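+ // With exact arithmetic |values[i]| * scaling_factor_inv <= kScale, so the
+ // clamp only guards against floating-point rounding at the boundary.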
+ quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); + } +} + +inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + float *__restrict__ result, + int result_stride) +{ + int batch, row, col; + for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) + { + const float batch_scaling_factor = scaling_factors[batch]; + // Get the address of the first row. + const int8_t *row_ptr = matrix; + for (row = 0; row < m_rows; ++row, result += result_stride) + { + // Initialize the dot product sum for the row to 0. + int32_t dotprod = 0; +#if defined(__GNUC__) + // Prefetch the row to cache. + __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */); +#endif + for (col = 0; col < m_cols; ++col, ++row_ptr) + { + dotprod += (*row_ptr) * (vectors[col]); + } // for col + *result += (dotprod * batch_scaling_factor); + } // for row + } // for batch +} + +inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vector, + const float *scaling_factors, int n_batch, + int32_t *, float *__restrict__ result, + int result_stride, ruy::Context *) +{ + PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors, + n_batch, result, result_stride); +} + +inline void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, + float *result, int result_stride) +{ + float *result_in_batch = result; + for (int b = 0; b < n_batch; b++) + { + const float *matrix_ptr = matrix; + for (int r = 0; r < m_rows; r++) + { + float dot_prod = 0.0f; + const float *vector_in_batch = vector + b * m_cols; + for (int c = 0; c < m_cols; c++) + { + dot_prod += *matrix_ptr++ * *vector_in_batch++; + } + *result_in_batch += dot_prod; + result_in_batch += result_stride; + } + } +} + +inline void PortableMeanStddevNormalization(const float *input_vector, float *output_vector, + int v_size, int n_batch) +{ + for (int batch = 0; batch < n_batch; ++batch) + { + float sum = 0.0f; + for (int i = 0; i < v_size; ++i) + { + sum += input_vector[i]; + } + const float mean = sum / v_size; + float sum_diff_sq = 0.0f; + for (int i = 0; i < v_size; ++i) + { + const float diff = input_vector[i] - mean; + sum_diff_sq += diff * diff; + } + const float variance = sum_diff_sq / v_size; + constexpr float kNormalizationConstant = 1e-8f; + const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant); + for (int i = 0; i < v_size; ++i) + { + output_vector[i] = (input_vector[i] - mean) * stddev_inv; + } + input_vector += v_size; + output_vector += v_size; + } +} + +inline void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); } + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__ diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h new file mode 100644 index 000000000..86caf7d18 --- /dev/null +++ b/compute/cker/include/cker/Shape.h @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SHAPE_H__ +#define __NNFW_CKER_SHAPE_H__ + +#include <algorithm> +#include <cstring> +#include <cassert> +#include <vector> + +#define UNUSED_RELEASE(a) (void)(a) + +namespace nnfw +{ +namespace cker +{ + +class Shape +{ +public: + // Shapes with dimensions up to 5 are stored directly in the structure, while + // larger shapes are separately allocated. + static constexpr int kMaxSmallSize = 5; + + Shape &operator=(Shape const &) = delete; + + Shape() : _size(0) {} + + explicit Shape(int dimensions_count) : _size(dimensions_count) + { + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + Shape(int shape_size, int32_t value) : _size(0) + { + Resize(shape_size); + for (int i = 0; i < shape_size; ++i) + { + SetDim(i, value); + } + } + + Shape(int dimensions_count, const int32_t *dims_data) : _size(0) + { + ReplaceWith(dimensions_count, dims_data); + } + + Shape(const std::initializer_list<int> init_list) : _size(0) { BuildFrom(init_list); } + + // Avoid using this constructor. We should be able to delete it when C++17 + // rolls out. + Shape(Shape const &other) : _size(other.DimensionsCount()) + { + if (_size > kMaxSmallSize) + { + _dims_pointer = new int32_t[_size]; + } + std::memcpy(DimsData(), other.DimsData(), sizeof(int32_t) * _size); + } + + bool operator==(const Shape &comp) const + { + return this->_size == comp._size && + std::memcmp(DimsData(), comp.DimsData(), _size * sizeof(int32_t)) == 0; + } + + ~Shape() + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + } + + inline int32_t DimensionsCount() const { return _size; } + inline int32_t Dims(int i) const + { + assert(i >= 0); + assert(i < _size); + return _size > kMaxSmallSize ? _dims_pointer[i] : _dims[i]; + } + inline void SetDim(int i, int32_t val) + { + assert(i >= 0); + assert(i < _size); + if (_size > kMaxSmallSize) + { + _dims_pointer[i] = val; + } + else + { + _dims[i] = val; + } + } + + inline int32_t *DimsData() { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + inline const int32_t *DimsData() const { return _size > kMaxSmallSize ? _dims_pointer : _dims; } + // The caller must ensure that the shape is no bigger than 4-D. 
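+ // (It unconditionally returns the inline _dims array, which holds the
+ // dimensions only while the shape fits in kMaxSmallSize entries; the
+ // stricter 4-D limit matches the Offset() helpers below, which index
+ // exactly four dimensions.)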
+ inline const int32_t *DimsDataUpTo4D() const { return _dims; } + + inline void Resize(int dimensions_count) + { + if (_size > kMaxSmallSize) + { + delete[] _dims_pointer; + } + _size = dimensions_count; + if (dimensions_count > kMaxSmallSize) + { + _dims_pointer = new int32_t[dimensions_count]; + } + } + + inline void ReplaceWith(int dimensions_count, const int32_t *dims_data) + { + Resize(dimensions_count); + int32_t *dst_dims = DimsData(); + std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); + } + + inline void ReplaceWith(const Shape &other) + { + ReplaceWith(other.DimensionsCount(), other.DimsData()); + } + + inline void ReplaceWith(Shape &&other) + { + Resize(0); + std::swap(_size, other._size); + if (_size <= kMaxSmallSize) + std::copy(other._dims, other._dims + kMaxSmallSize, _dims); + else + _dims_pointer = other._dims_pointer; + } + + template <typename T> inline void BuildFrom(const T &src_iterable) + { + const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); + Resize(dimensions_count); + int32_t *data = DimsData(); + for (auto it : src_iterable) + { + *data = it; + ++data; + } + } + + // This will probably be factored out. Old code made substantial use of 4-D + // shapes, and so this function is used to extend smaller shapes. Note that + // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be + // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their + // inputs should already be 4-D, so this function should not be needed. + inline static Shape ExtendedShape(int new_shape_size, const Shape &shape) + { + return Shape(new_shape_size, shape, 1); + } + + inline void BuildFrom(const std::initializer_list<int> init_list) + { + BuildFrom<const std::initializer_list<int>>(init_list); + } + + // Returns the total count of elements, that is the size when flattened into a + // vector. + inline int FlatSize() const + { + int buffer_size = 1; + const int *dims_data = DimsData(); + for (int i = 0; i < _size; i++) + { + const int dim = dims_data[i]; + buffer_size *= dim; + } + return buffer_size; + } + + bool operator!=(const Shape &comp) const { return !((*this) == comp); } + +private: + // For use only by ExtendedShape(), written to guarantee (return-value) copy + // elision in C++17. + // This creates a shape padded to the desired size with the specified value. + Shape(int new_shape_size, const Shape &shape, int pad_value) : _size(0) + { + assert(new_shape_size >= shape.DimensionsCount()); + assert(new_shape_size <= kMaxSmallSize); + Resize(new_shape_size); + const int size_increase = new_shape_size - shape.DimensionsCount(); + for (int i = 0; i < size_increase; ++i) + { + SetDim(i, pad_value); + } + std::memcpy(DimsData() + size_increase, shape.DimsData(), + sizeof(int32_t) * shape.DimensionsCount()); + } + + int32_t _size; + union { + int32_t _dims[kMaxSmallSize]; + int32_t *_dims_pointer{nullptr}; + }; +}; + +inline int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2) +{ + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + assert(shape1.Dims(index1) == shape2.Dims(index2)); + return shape1.Dims(index1); +} + +template <typename... Args> +int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2, Args... 
args) +{ + assert(shape1.Dims(index1) == shape2.Dims(index2)); + UNUSED_RELEASE(shape2); + UNUSED_RELEASE(index2); + return MatchingDim(shape1, index1, args...); +} + +inline Shape GetShape(const std::vector<int32_t> &data) { return Shape(data.size(), data.data()); } + +inline int Offset(const Shape &shape, int i0, int i1, int i2, int i3) +{ + assert(shape.DimensionsCount() == 4); + const int *dims_data = shape.DimsDataUpTo4D(); + assert(i0 >= 0 && i0 < dims_data[0]); + assert(i1 >= 0 && i1 < dims_data[1]); + assert(i2 >= 0 && i2 < dims_data[2]); + assert(i3 >= 0 && i3 < dims_data[3]); + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +inline int Offset(const Shape &shape, int *index) +{ + return Offset(shape, index[0], index[1], index[2], index[3]); +} + +inline int FlatSizeSkipDim(const Shape &shape, int skip_dim) +{ + const int dims_count = shape.DimensionsCount(); + assert(skip_dim >= 0 && skip_dim < dims_count); + const auto *dims_data = shape.DimsData(); + int flat_size = 1; + for (int i = 0; i < dims_count; ++i) + { + flat_size *= (i == skip_dim) ? 1 : dims_data[i]; + } + return flat_size; +} + +// Flat size calculation, checking that dimensions match with one or more other +// arrays. +template <typename... Ts> inline bool checkMatching(const Shape &shape, Ts... check_shapes) +{ + const Shape check_shapes_array[sizeof...(Ts)] = {std::forward<Ts>(check_shapes)...}; + for (const auto &check_shape : check_shapes_array) + { + // Check matching of shapes except the case of that two shapes can be scalar + if (shape.DimensionsCount() > 1 || check_shape.DimensionsCount() > 1 || shape.FlatSize() != 1 || + check_shape.FlatSize() != 1) + { + if (shape.DimensionsCount() != check_shape.DimensionsCount()) + { + return false; + } + for (int i = 0; i < shape.DimensionsCount(); ++i) + { + if (shape.Dims(i) != check_shape.Dims(i)) + { + return false; + } + } + } + } + return true; +} + +struct UNUSED_ALL +{ + template <typename... Args> UNUSED_ALL(Args const &...) {} +}; +template <typename... Ts> inline int MatchingFlatSize(const Shape &shape, Ts... 
check_shapes) +{ + UNUSED_ALL{check_shapes...}; + assert(checkMatching(shape, std::forward<Ts>(check_shapes)...)); + return shape.FlatSize(); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return FlatSizeSkipDim(shape, skip_dim); +} + +inline int MatchingFlatSizeSkipDim(const Shape &shape, int skip_dim, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + UNUSED_RELEASE(check_shape_0); + const int dims_count = shape.DimensionsCount(); + for (int i = 0; i < dims_count; ++i) + { + if (i != skip_dim) + { + assert(shape.Dims(i) == check_shape_0.Dims(i)); + } + } + return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1); +} + +inline int MatchingElementsSize(const Shape &shape, const Shape &check_shape_0, + const Shape &check_shape_1) +{ + const int size_1 = shape.FlatSize(); + const int size_2 = check_shape_0.FlatSize(); + const int size_3 = check_shape_1.FlatSize(); + assert(size_1 == size_2); + assert(size_2 == size_3); + UNUSED_RELEASE(size_2); + UNUSED_RELEASE(size_3); + return size_1; +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SHAPE_H__ diff --git a/compute/cker/include/cker/TensorUtils.h b/compute/cker/include/cker/TensorUtils.h new file mode 100644 index 000000000..bac79b887 --- /dev/null +++ b/compute/cker/include/cker/TensorUtils.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TENSOR_UTILS_H__ +#define __NNFW_CKER_TENSOR_UTILS_H__ + +#include "cker/Types.h" +#include "cker/PortableTensorUtils.h" +#include "cker/NeonTensorUtils.h" +#include "cker/neon/neon_check.h" + +#include <cstring> +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void CwiseClipping(float *vector, const int v_size, const float clipping_value) +{ + NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value); +} + +inline void VectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector) +{ + PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector); +} + +inline void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, + float *batch_vector) +{ + PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); +} + +// Cwise product of two vectors. +template <typename T> +inline void VectorVectorCwiseProduct(const T *__restrict__ vector1, const T *__restrict__ vector2, + int v_size, T *__restrict__ result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ = *vector1++ * *vector2++; + } +} + +// Cwise product and accumulate of two vectors. 
Since it's a MAC operation, the +// assumption here is that result array is initialized to valid values. +template <typename T> +inline void VectorVectorCwiseProductAccumulate(const T *__restrict__ vector1, + const T *__restrict__ vector2, int v_size, + T *__restrict__ result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ += *vector1++ * *vector2++; + } +} + +// Cwise product of a vector and a batch-vector. +template <typename T> +inline void VectorBatchVectorCwiseProduct(const T *vector, int v_size, const T *batch_vector, + int n_batch, T *result) +{ + for (int b = 0; b < n_batch; b++) + { + VectorVectorCwiseProduct(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +template <typename T> +inline void VectorBatchVectorCwiseProductAccumulate(const T *vector, int v_size, + const T *batch_vector, int n_batch, T *result) +{ + for (int b = 0; b < n_batch; b++) + { + VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + +inline bool IsZeroVector(const float *vector, int v_size) +{ + return NEON_OR_PORTABLE(IsZeroVector, vector, v_size); +} + +inline void ApplyActivationToVector(const float *vector, int v_size, + FusedActivationFunctionType activation, float *result) +{ + PortableApplyActivationToVector(vector, v_size, activation, result); +} + +inline void Sub1Vector(const float *vector, int v_size, float *result) +{ + NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result); +} + +inline void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, + float *min, float *max, float *scaling_factor) +{ + return NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, max, + scaling_factor); +} + +inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, + const int m_cols, const int8_t *vector, + const float *scaling_factors, int n_batch, + float *result, int result_stride) +{ + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, + scaling_factors, n_batch, result, result_stride); +} + +inline void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, float *result, + int result_stride) +{ + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, n_batch, + result, result_stride); +} + +inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, + const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, + int32_t *scratch, float *result, int result_stride, + ruy::Context *ruy_context) +{ + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors, + scaling_factors, n_batch, scratch, result, result_stride, ruy_context); +} + +inline void MeanStddevNormalization(const float *input_vector, float *output_vector, int v_size, + int n_batch) +{ + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); +} + +inline void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); } + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TENSOR_UTILS_H__ diff --git a/compute/cker/include/cker/Types.h 
b/compute/cker/include/cker/Types.h new file mode 100644 index 000000000..acb6cac55 --- /dev/null +++ b/compute/cker/include/cker/Types.h @@ -0,0 +1,558 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TYPES_H__ +#define __NNFW_CKER_TYPES_H__ + +#include <cstdint> +#include <type_traits> +#include <limits> +#include <string> + +namespace nnfw +{ +namespace cker +{ + +enum class FusedActivationFunctionType +{ + kNone = 0, + kRelu6 = 1, + kRelu1 = 2, + kRelu = 3, + kTanh = 4, + kSigmoid = 6, +}; +enum class PaddingType +{ + kNone = 0, + kSame = 1, + kValid = 2, +}; + +enum class BinaryArithmeticOpType +{ + ADD = 0, + SUB = 1, + MUL = 2, + DIV = 3, + POW = 4, +}; + +enum class ComparisonOpType +{ + Equal, + NotEqual, + Greater, + GreaterEqual, + Less, + LessEqual +}; + +struct PaddingValues +{ + int16_t width; + int16_t height; +}; + +enum class BroadcastableOpCategory : uint8_t +{ + kNone, + kNonBroadcast, // Matching input shapes. + kFirstInputBroadcastsFast, // Fivefold nested loops. + kSecondInputBroadcastsFast, // Fivefold nested loops. + kGenericBroadcast, // Fall-back. +}; + +struct PoolParams +{ + FusedActivationFunctionType activation; + PaddingType padding_type; + PaddingValues padding_values; + int stride_height; + int stride_width; + int filter_height; + int filter_width; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct SoftmaxParams +{ + // beta is not really used (not a Tensorflow parameter) and not implemented + // for LogSoftmax. + double beta; + int axis; + // uint8 inference params. Used even when beta defaults to 1.0. + int32_t input_multiplier; + int32_t input_left_shift; + // Reverse scaling is only used by LogSoftmax. + int32_t reverse_scaling_divisor; + int32_t reverse_scaling_right_shift; + int diff_min; + int32_t zero_point; + float scale; + float *table; +}; + +struct PackParams +{ + int8_t axis; + // zeropoint and scale were only used to implement PackWithScaling in the legacy code of + // tensorflow + // const int32_t* input_zeropoint; + // const float* input_scale; + uint16_t inputs_count; + // int32_t output_zeropoint; + // float output_scale; +}; + +struct UnpackParams +{ + uint16_t num_split; + int16_t axis; +}; + +struct ConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. 
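+ // Note: in the TFLite convention these appear to encode the float rescale
+ // M = (input_scale * filter_scale) / output_scale as a fixed-point mantissa
+ // (output_multiplier) plus a power-of-two exponent (output_shift); the
+ // int32 accumulator is rescaled by M, offset by output_offset, and clamped
+ // to the activation bounds below.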
+ int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; + bool is_replaced_weights{false}; +}; + +struct ComparisonParams +{ + ComparisonOpType type; + int left_shift; + int input1_shift; + int input2_shift; + int32_t input1_offset; + int32_t input1_multiplier; + int32_t input2_offset; + int32_t input2_multiplier; + bool is_broadcast; +}; + +struct BinaryArithmeticOpParam +{ + // Shape dependent / common to data / op types. + BroadcastableOpCategory broadcast_category{BroadcastableOpCategory::kNone}; + // uint8 inference params. + int32_t input1_offset = 0; + int32_t input2_offset = 0; + int32_t output_offset = 0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; + // Add / Sub, not Mul, uint8 inference params. + int32_t left_shift = 0; + int32_t input1_multiplier = 0; + int32_t input1_shift = 0; + int32_t input2_multiplier = 0; + int32_t input2_shift = 0; + // uint8, etc, activation params. + int32_t quantized_activation_min = 0; + int32_t quantized_activation_max = 0; + // float activation params. + float float_activation_min = 0; + float float_activation_max = 0; + + // Processed output dimensions. + // Let input "a" be the one that broadcasts in the faster-changing dimension. + // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and + // {b0, b1, b2, b3, b4}, + // broadcast_shape[4] = b0 = a0. + // broadcast_shape[3] = b1; a1 = 1. + // broadcast_shape[2] = b2 = a2. + // broadcast_shape[1] = a3; b3 = 1. + // broadcast_shape[0] = b4 = a4. + int broadcast_shape[5] = {}; +}; + +struct TransposeParams +{ + int8_t perm_count; + int32_t perm[4]; +}; + +struct ConcatenationParams +{ + int8_t axis; + const int32_t *input_zeropoint; + const float *input_scale; + uint16_t inputs_count; + int32_t output_zeropoint; + float output_scale; +}; + +struct DepthwiseConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + int16_t depth_multiplier; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct FullyConnectedParams +{ + FusedActivationFunctionType activation{FusedActivationFunctionType::kNone}; + // uint8 inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + float weights_scale; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params - no one use this params, but ruy might use them later. + // float float_activation_min; + // float float_activation_max; + // FullyConnectedWeightsFormat weights_format; +}; + +struct L2NormParams +{ + // uint8 inference params. 
+ int32_t input_zero_point; +}; + +enum LSTMKernelType +{ + kTfLiteLSTMFullKernel = 0, + kTfLiteLSTMBasicKernel +}; + +struct LSTMParams +{ + // Parameters for LSTM version 1. + FusedActivationFunctionType activation{FusedActivationFunctionType::kNone}; + float cell_clip; + float proj_clip; + + // Parameters for LSTM version 2. + // kTfLiteLSTMBasicKernel is only supported in version 2 or above. + LSTMKernelType kernel_type; + + // Parameters for LSTM version 4. + bool asymmetric_quantize_inputs; +}; + +struct GatherParams +{ + int32_t axis; +}; + +struct InstanceNormParams +{ + float epsilon; + float float_activation_min; + float float_activation_max; +}; + +struct ResizeBilinearParams +{ + int32_t output_height; + int32_t output_width; + bool align_corners; + bool half_pixel_centers; +}; + +struct TransposeConvParams +{ + PaddingType padding_type; + PaddingValues padding_values; + // TODO(starka): This was just "stride", so check that width+height is OK. + int16_t stride_width; + int16_t stride_height; + int16_t dilation_width_factor; + int16_t dilation_height_factor; + // uint8_t inference params. + // TODO(b/65838351): Use smaller types if appropriate. + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_shift; + // uint8_t, etc, activation params. + int32_t quantized_activation_min; + int32_t quantized_activation_max; + // float activation params. + float float_activation_min; + float float_activation_max; +}; + +struct SliceParams +{ + int8_t begin_count; + int32_t begin[4]; + int8_t size_count; + int32_t size[4]; +}; + +struct StridedSliceParams +{ + int8_t start_indices_count; + int16_t start_indices[4]; + int8_t stop_indices_count; + int16_t stop_indices[4]; + int8_t strides_count; + int16_t strides[4]; + + int16_t begin_mask; + int16_t ellipsis_mask; + int16_t end_mask; + int16_t new_axis_mask; + int16_t shrink_axis_mask; +}; + +struct SplitParams +{ + uint16_t num_split; + int16_t axis; +}; + +struct SplitVParams +{ + uint16_t num_split; + int16_t axis; +}; + +struct FusedBatchNormParams +{ + bool is_training; + std::string data_format; // UNKNOWN(0), NHWC(1), NCHW(2) + float epsilon; +}; + +struct SpaceToBatchParams +{ + // "Zero" padding for uint8 means padding with the output offset. + int32_t output_offset; +}; + +struct SpaceToDepthParams +{ + int32_t block_size; +}; + +enum class Order +{ + kColMajor, + kRowMajor +}; + +enum class CachePolicy : std::uint8_t +{ + kNeverCache, + kCacheIfLargeSpeedup, + kAlwaysCache, +}; + +// MatrixParams encapsulates the parameters that Gemm needs about each +// matrix, besides the buffer data pointer. +// Compare to ruy::Matrix, which also encapsulates the data pointer. +// Rationale for leaving the data pointer out of here: doing so +// requires complicated const-correctness mechanics. See +// ruy::ConstCheckingPtr. +template <typename Scalar> struct MatrixParams +{ + // Storage layout order. For now we only do plain linear non-strided + // layout. It would be easy to support a stride if needed. + Order order = Order::kColMajor; + // Number of rows of the matrix. + int rows = 0; + // Number of columns of the matrix. + int cols = 0; + // The zero_point, i.e. which Scalar value is to be interpreted as zero. + // When Scalar is floating-point, this must be 0. 
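+  // E.g. for uint8 data quantized as real = scale * (q - zero_point), pass
+  // that zero_point here.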
+  Scalar zero_point = 0;
+  // When the data pointed to by this matrix is constant data, so that it is
+  // valid to assume that equality of pointers implies equality of data,
+  // a CachePolicy may be used instead of the default kNeverCache,
+  // which will enable ruy to take advantage of this constancy of the data to
+  // cache the packing work, which can be a large speedup in matrix*vector
+  // and other narrow shapes.
+  CachePolicy cache_policy = CachePolicy::kNeverCache;
+};
+
+// Enumeration of broad categories of Gemm.
+//
+// The primary reason for this to exist is to allow Gemm to compile
+// only uniform-quantized or only per-channel-quantized code paths.
+// This is unneeded with ruy as the back-end, as this is only a runtime
+// difference in ruy, but with gemmlowp these really are separate code
+// paths and templatizing in a QuantizationFlavor is necessary to avoid
+// compiling unused gemmlowp code. Indeed, TFLite currently uses
+// uint8 with uniform quantization and int8 with per-channel quantization,
+// and does not use uint8 with per-channel. We want to avoid compiling
+// the gemmlowp uint8 per-channel path when gemmlowp is the back-end.
+//
+// It's possible to drop this in the future if gemmlowp goes away and no
+// other then-relevant backend library handles quantized paths in a way that
+// requires knowing this at compile-time.
+enum class QuantizationFlavor
+{
+  // Floating-point Gemm: the accumulators are not multiplied by any
+  // 'multiplier'.
+  kFloatingPoint,
+  // Quantized Gemm using a single multiplier for all accumulators.
+  kIntegerWithUniformMultiplier,
+  // Quantized Gemm using separate multipliers for accumulators of each
+  // row of the destination matrix. This is what is called 'per-channel'
+  // in GemmParams. Here we use the more specific 'per-row' terminology
+  // to allow for the possibility of 'per-column' in the future, and to
+  // allow for that to be a separate code path in some back-end such as
+  // gemmlowp.
+  kIntegerWithPerRowMultiplier
+};
+
+// Additional parameters that Gemm needs, beyond what falls into
+// the MatrixParams that it takes. Compare to ruy::Spec.
+//
+// Decoupling AccumScalar from DstScalar (rather than deducing it from
+// DstScalar) is useful future-proofing. Think of a float16 path using
+// float32 accum.
+//
+// QuantizationFlavor is passed here even though it's technically not used
+// in this class. This is so that we retain the ability in the future to
+// specialize this class for quantization flavor, and this allows for
+// Gemm to be templatized in quantization_flavor via the GemmParams that it
+// takes, allowing for automatic template parameter deduction to take place,
+// so that most call sites don't need to specify a QuantizationFlavor
+// (only those that need per-channel quantization do).
+template <typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor =
+              std::is_floating_point<AccumScalar>::value
+                  ? QuantizationFlavor::kFloatingPoint
+                  : QuantizationFlavor::kIntegerWithUniformMultiplier>
+struct GemmParams
+{
+  // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa)
+  // of the multiplier by which accumulators are multiplied before being cast
+  // to the destination type.
+  AccumScalar multiplier_fixedpoint = 0;
+  // Only for non-floating-point cases. The exponent part of the aforementioned
+  // multiplier.
+  int multiplier_exponent = 0;
+  // Per-channel variant of multiplier_fixedpoint.
If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_fixedpoint. + const AccumScalar *multiplier_fixedpoint_perchannel = nullptr; + // Per-channel variant of multiplier_exponent. If not nullptr, this must + // point to a buffer of as many values as there are rows in the destination + // matrix. Each row of the destination matrix will use the corresponding + // buffer element instead of multiplier_exponent. + // + // Either none or both of multiplier_exponent_perchannel and + // multiplier_fixedpoint_perchannel must be nullptr. + const int *multiplier_exponent_perchannel = nullptr; + // The bias vector data, if not null. + const AccumScalar *bias = nullptr; + // min clamp bound of destination values. + DstScalar clamp_min = std::is_floating_point<DstScalar>::value + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); + // max clamp bound of destination values. + DstScalar clamp_max = std::is_floating_point<DstScalar>::value + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); +}; + +// Validates self-consistency of GemmParams. +template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor> +void ValidateGemmParams(const GemmParams<AccumScalar, DstScalar, quantization_flavor> ¶ms) +{ + // Guard consistency of the quantized multiplier fields. + if (quantization_flavor == QuantizationFlavor::kFloatingPoint) + { + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier && + !std::is_same<DstScalar, int32_t>::value) + { + assert(params.multiplier_fixedpoint); + // Nothing to check about multiplier_exponent + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier && + !std::is_same<DstScalar, int32_t>::value) + { + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(params.multiplier_fixedpoint_perchannel); + assert(params.multiplier_exponent_perchannel); + } + else + { + // For the get raw accumulator case, we should make sure none of the + // quantization params are set. + assert(!params.multiplier_fixedpoint); + assert(!params.multiplier_exponent); + assert(!params.multiplier_fixedpoint_perchannel); + assert(!params.multiplier_exponent_perchannel); + } + UNUSED_RELEASE(params); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TYPES_H__ diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h new file mode 100644 index 000000000..2abb998d0 --- /dev/null +++ b/compute/cker/include/cker/Utils.h @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_UTILS_H__
+#define __NNFW_CKER_UTILS_H__
+
+#include "Shape.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <fixedpoint/fixedpoint.h>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
+{
+  return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
+}
+
+inline void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
+{
+  if (double_multiplier == 0.)
+  {
+    *quantized_multiplier = 0;
+    *shift = 0;
+    return;
+  }
+
+  const double q = std::frexp(double_multiplier, shift);
+  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
+
+  assert(q_fixed <= (1ll << 31));
+  if (q_fixed == (1ll << 31))
+  {
+    q_fixed /= 2;
+    ++*shift;
+  }
+  assert(q_fixed <= std::numeric_limits<int32_t>::max());
+  // A shift amount smaller than -31 would cause all bits to be shifted out
+  // and thus all results would be zero. We implement that instead with
+  // q_fixed==0, so as to avoid hitting issues with right-shift
+  // operations with shift amounts greater than 31. Note that this happens
+  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
+  // that we're effectively flushing tiny double_multiplier's to zero.
+  // We could conceivably handle values in the range (roughly) [32, 63]
+  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). From that point of view
+  // the present handling is just doing 'flush denormals to zero'. We could
+  // reconsider and actually generate nonzero denormals if a need arises.
+  if (*shift < -31)
+  {
+    *shift = 0;
+    q_fixed = 0;
+  }
+  *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+inline void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
+                                                int32_t *quantized_multiplier, int *left_shift)
+{
+  assert(double_multiplier < 1.0);
+  assert(double_multiplier > 0.0);
+  int shift;
+  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
+  assert(shift <= 0);
+  *left_shift = shift;
+}
+
+inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
+{
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ?
0 : -shift; + return gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); +} + +inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, + int left_shift) +{ + return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier); +} + +inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, + int32_t quantized_multiplier, + int left_shift) +{ + return gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); +} + +inline int NodeOffset(int b, int h, int w, int height, int width) +{ + return (b * height + h) * width + w; +} + +inline int CountLeadingZeros(uint32_t integer_input) +{ + const uint32_t one_in_leading_positive = 1U << 31; + int leading_zeros = 0; + while (integer_input < one_in_leading_positive) + { + integer_input <<= 1; + ++leading_zeros; + } + return leading_zeros; +} + +inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, + int32_t *output_inv_sqrt, int *output_shift) +{ + assert(input >= 0); + if (input <= 1) + { + // Handle the input value 1 separately to avoid overflow in that case + // in the general computation below (b/143972021). Also handle 0 as if it + // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid + // but rare/unrealistic input value. We can expect both to occur in some + // incompletely trained models, but probably not in fully trained models. + *output_inv_sqrt = std::numeric_limits<std::int32_t>::max(); + *output_shift = 0; + return; + } + assert(input > 1); + *output_shift = 11; + while (input >= (1 << 29)) + { + input /= 4; + ++*output_shift; + } + const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1; + const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; + const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; + *output_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + assert(input >= (1 << 27)); + assert(input < (1 << 29)); + using gemmlowp::FixedPoint; + using gemmlowp::Rescale; + using gemmlowp::SaturatingRoundingMultiplyByPOT; + // Using 3 integer bits gives us enough room for the internal arithmetic in + // this Newton-Raphson iteration. + using F3 = FixedPoint<int32_t, 3>; + using F0 = FixedPoint<int32_t, 0>; + const F3 fixedpoint_input = F3::FromRaw(input >> 1); + const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); + const F3 fixedpoint_half_three = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + // Newton-Raphson iteration + // Naive unoptimized starting guess: x = 1 + F3 x = F3::One(); + // Naive unoptimized number of iterations: 5 + for (int i = 0; i < 5; i++) + { + const F3 x3 = Rescale<3>(x * x * x); + x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); + } + const F0 fixedpoint_half_sqrt_2 = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + x = x * fixedpoint_half_sqrt_2; + *output_inv_sqrt = x.raw(); + if (*output_shift < 0) + { + *output_inv_sqrt <<= -*output_shift; + *output_shift = 0; + } + // Convert right shift (right is positive) to left shift. + *output_shift *= reverse_shift; +} + +// Comment from tensorflow lite: +// +// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. 
+// +// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional +// rectangular array of numbers. +// +// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. +// However, as Dims<N> is to be deprecated, this class exists as an adaptor +// to enable simple unoptimized implementations of element-wise broadcasting +// operations. +template <int N> struct NdArrayDesc +{ + // The "extent" of each dimension. Indices along dimension d must be in the + // half-open interval [0, extents[d]). + int extents[N]; + + // The number of *elements* (not bytes) between consecutive indices of each + // dimension. + int strides[N]; +}; + +// Comment from tensorflow lite: +// +// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// BROADCASTING. +// +// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. +inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3) +{ + assert(i0 >= 0 && i0 < desc.extents[0]); + assert(i1 >= 0 && i1 < desc.extents[1]); + assert(i2 >= 0 && i2 < desc.extents[2]); + assert(i3 >= 0 && i3 < desc.extents[3]); + return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3]; +} + +template <int N> inline int SubscriptToIndexGeneric(const NdArrayDesc<N> *desc, int *iter) +{ + int ret_indx = 0; + for (size_t idx = 0; idx < static_cast<size_t>(N); idx++) + { + assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]); + ret_indx += iter[idx] * desc->strides[idx]; + } + + return ret_indx; +} + +// Copies dims to desc, calculating strides. +template <int N> inline void CopyDimsToDesc(const Shape &input_shape, NdArrayDesc<N> *desc_out) +{ + int desc_stride = 1; + for (int i = N - 1; i >= 0; --i) + { + desc_out->extents[i] = input_shape.Dims(i); + desc_out->strides[i] = desc_stride; + desc_stride *= input_shape.Dims(i); + } +} + +template <int N> +inline void +NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape, + NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out) +{ + assert(desc0_out != nullptr); + assert(desc1_out != nullptr); + + auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape); + auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape); + + // Copy dims to desc, calculating strides. + CopyDimsToDesc<N>(extended_input0_shape, desc0_out); + CopyDimsToDesc<N>(extended_input1_shape, desc1_out); + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0. 
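+  // E.g. input0 shape {2, 3} and input1 shape {1, 3} extend to {1, 1, 2, 3}
+  // and {1, 1, 1, 3}; desc1 then gets extents[2] = 2 with strides[2] = 0, so
+  // both descriptors can be walked with one common 4-D index.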
+ for (int i = 0; i < N; ++i) + { + const int extent0 = extended_input0_shape.Dims(i); + const int extent1 = extended_input1_shape.Dims(i); + if (extent0 != extent1) + { + if (extent0 == 1) + { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent1; + } + else + { + assert(extent1 == 1); + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent0; + } + } + } +} + +template <int N> +inline void +NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape, + const Shape &input2_shape, NdArrayDesc<N> *desc0_out, + NdArrayDesc<N> *desc1_out, NdArrayDesc<N> *desc2_out) +{ + assert(desc0_out != nullptr); + assert(desc1_out != nullptr); + assert(desc2_out != nullptr); + + auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape); + auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape); + auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape); + + // Copy dims to desc, calculating strides. + CopyDimsToDesc<N>(extended_input0_shape, desc0_out); + CopyDimsToDesc<N>(extended_input1_shape, desc1_out); + CopyDimsToDesc<N>(extended_input2_shape, desc2_out); + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0. + for (int i = 0; i < N; ++i) + { + const int extent0 = extended_input0_shape.Dims(i); + const int extent1 = extended_input1_shape.Dims(i); + const int extent2 = extended_input2_shape.Dims(i); + + int extent = extent0; + if (extent1 != 1) + extent = extent1; + if (extent2 != 1) + extent = extent2; + + assert(extent0 == 1 || extent0 == extent); + assert(extent1 == 1 || extent1 == extent); + assert(extent2 == 1 || extent2 == extent); + + if (!(extent0 == extent1 && extent1 == extent2)) + { + if (extent0 == 1) + { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent; + } + if (extent1 == 1) + { + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent; + } + if (extent2 == 1) + { + desc2_out->strides[i] = 0; + desc2_out->extents[i] = extent; + } + } + } +} + +// Gets next index to iterate through a multidimensional array. +inline bool NextIndex(const int num_dims, const int *dims, int *current) +{ + if (num_dims == 0) + { + return false; + } + assert(dims != nullptr); + assert(current != nullptr); + int carry = 1; + for (int idx = num_dims - 1; idx >= 0; --idx) + { + int current_val = current[idx] + carry; + assert(dims[idx] >= current_val); + if (dims[idx] == current_val) + { + current[idx] = 0; + } + else + { + current[idx] = current_val; + carry = 0; + break; + } + } + return (carry == 0); +} + +// Gets offset of index if reducing on axis. When reducing, the flattened offset +// will not change, if the input index changes on the given axis. For example, +// if you have a 3D tensor and you are reducing to 2D by eliminating axis 0, +// then index (0, 1, 2) and index (1, 1, 2) will map to the same flattened +// offset. +// TODO(kanlig): uses Dims to represent dimensions. 
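+// E.g. with dims {4, 5, 6} and axis = {0}, index {i, j, k} maps to offset
+// j * 6 + k for every i, so all values along axis 0 land on (and can be
+// reduced into) the same output element.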
+inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index, + const int num_axis, const int *axis) +{ + if (num_dims == 0) + { + return 0; + } + + assert(dims != nullptr); + assert(index != nullptr); + + size_t offset = 0; + for (int idx = 0; idx < num_dims; ++idx) + { + // if we need to skip this axis + bool is_axis = false; + if (axis != nullptr) + { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (idx == axis[axis_idx]) + { + is_axis = true; + break; + } + } + } + if (!is_axis) + { + offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]); + } + } + return offset; +} + +template <typename T> void optimized_ops_preload_l1_keep(const T *ptr) +{ +#ifdef __GNUC__ + // builtin offered by GCC-compatible compilers including clang + __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3); +#else + (void)ptr; +#endif +} + +// Writes randomly accessed values from `input` sequentially into `output`. +template <typename T> class SequentialTensorWriter +{ +public: + SequentialTensorWriter(const T *input_data, T *output_data) + : input_data_(input_data), output_ptr_(output_data) + { + } + + void Write(int position) { *output_ptr_++ = input_data_[position]; } + void WriteN(int position, int len) + { + memcpy(output_ptr_, &input_data_[position], sizeof(T) * len); + output_ptr_ += len; + } + +private: + const T *input_data_; + T *output_ptr_; +}; + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_UTILS_H__ diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h new file mode 100644 index 000000000..49c34211a --- /dev/null +++ b/compute/cker/include/cker/eigen/EigenSupport.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_EIGEN_SUPPORT_H__ +#define __NNFW_CKER_EIGEN_EIGEN_SUPPORT_H__ + +//#if defined(CKER_OPTIMIZED_EIGEN) + +#include <Eigen/Core> +#include <thread> +#include "cker/eigen/eigen_spatial_convolutions.h" + +#ifdef EIGEN_USE_THREADS +#include <unsupported/Eigen/CXX11/ThreadPool> +#endif + +namespace nnfw +{ +namespace cker +{ +namespace eigen_support +{ + +// Shorthands for the types we need when interfacing with the EigenTensor +// library. +typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, + Eigen::Aligned> + EigenMatrix; +typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>, + Eigen::Aligned> + ConstEigenMatrix; + +typedef Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, + Eigen::Aligned> + EigenTensor; +typedef Eigen::TensorMap<Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>, + Eigen::Aligned> + ConstEigenTensor; + +// Utility functions we need for the EigenTensor API. 
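+// (MatMulConvFunctor below lets a convolution that is equivalent to a plain
+// matrix multiplication - e.g. a 1x1 kernel with unit strides - be dispatched
+// as a single Eigen contraction on the given device.)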
+template <typename Device, typename T> struct MatMulConvFunctor +{ + // Computes on device "d": out = in0 * in1, where * is matrix + // multiplication. + void operator()(const Device &d, EigenMatrix out, ConstEigenMatrix in0, ConstEigenMatrix in1, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> &dim_pair) + { + out.device(d) = in0.contract(in1, dim_pair); + } +}; + +// We have a single global threadpool for all convolution operations. This means +// that inferences started from different threads may block each other, but +// since the underlying resource of CPU cores should be consumed by the +// operations anyway, it shouldn't affect overall performance. +class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface +{ +public: + // Takes ownership of 'pool' + explicit EigenThreadPoolWrapper(Eigen::ThreadPool *pool) : pool_(pool) {} + ~EigenThreadPoolWrapper() override {} + + void Schedule(std::function<void()> fn) override { pool_->Schedule(std::move(fn)); } + int NumThreads() const override { return pool_->NumThreads(); } + int CurrentThreadId() const override { return pool_->CurrentThreadId(); } + +private: + std::unique_ptr<Eigen::ThreadPool> pool_; +}; + +struct EigenContext +{ + constexpr static int default_num_threadpool_threads = 4; + std::unique_ptr<Eigen::ThreadPoolInterface> thread_pool_wrapper; + std::unique_ptr<Eigen::ThreadPoolDevice> device; + + EigenContext() + { + int num_threads = std::thread::hardware_concurrency(); + if (num_threads == 0) + { + num_threads = default_num_threadpool_threads; + } + device.reset(); // destroy before we invalidate the thread pool + thread_pool_wrapper.reset(new EigenThreadPoolWrapper(new Eigen::ThreadPool(num_threads))); + device.reset(new Eigen::ThreadPoolDevice(thread_pool_wrapper.get(), num_threads)); + } + + static inline EigenContext &GetEigenContext() + { + static EigenContext instance; + return instance; + } +}; + +inline const Eigen::ThreadPoolDevice *GetThreadPoolDevice() +{ + auto &ctx = EigenContext::GetEigenContext(); + return ctx.device.get(); +} + +} // namespace eigen_support +} // namespace cker +} // namespace nnfw + +//#endif // defined(CKER_OPTIMIZED_EIGEN) + +#endif // __NNFW_CKER_EIGEN_EIGEN_SUPPORT_H__ diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h new file mode 100644 index 000000000..f9c706370 --- /dev/null +++ b/compute/cker/include/cker/eigen/Utils.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_UTILS_H__ +#define __NNFW_CKER_EIGEN_UTILS_H__ + +#include <Eigen/Core> +#include <type_traits> +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +// Make a local VectorMap typedef allowing to map a float array +// as a Eigen vector expression. 
The std::conditional here is to +// construct the suitable Eigen type for the constness of the +// data. Indeed, for const data, we need to produce +// Eigen::Map<const Eigen::Matrix<float, ...>> +// and not the more straightforward +// Eigen::Map<Eigen::Matrix<const float, ...>> +template <typename Scalar> +using VectorMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; + +template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Shape &shape) +{ + const int size = shape.FlatSize(); + return VectorMap<Scalar>(data, size, 1); +} + +// Make a local VectorMap typedef allowing to map a float array +// as a Eigen matrix expression. The same explanation as for VectorMap +// above also applies here. +template <typename Scalar> +using MatrixMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, + Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + +template <typename Scalar> +MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) +{ + const int dims_count = shape.DimensionsCount(); + const int rows = shape.Dims(dims_count - 1); + const int cols = FlatSizeSkipDim(shape, dims_count - 1); + return MatrixMap<Scalar>(data, rows, cols); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_EIGEN_UTILS_H__ diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h new file mode 100644 index 000000000..dc3e2552d --- /dev/null +++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_EIGEN_CONVOLUTION_HELPERS_H__ +#define __NNFW_CKER_EIGEN_EIGEN_CONVOLUTION_HELPERS_H__ + +namespace Eigen +{ +namespace internal +{ + +// TensorEvaluatorHasPartialPacket<TensorEvaluatorType, PacketType, IndexType> +// provides `value` that is true if TensorEvaluatorType has `PacketType +// partialPacket<PacketType>(IndexType, unpacket_traits<PacketType>::mask_t) +// const` and if the PacketType supports masked load. +// +// Partial packets are used to: +// +// 1) Split the packet over two columns in eigen based spatial convolution and +// use partial loads for each individual part before combining them to get the +// required packet. This class is used to pick the correct implementation of +// loadPacketStandard function. +// +// 2) Split the packet over two rows (within the same column) in eigen based +// cuboid convolution and use partial loads for each individual part before +// combining them to get the required packet. 
This class is used to pick the +// correct implementation of loadPacketStandard function. This usage is similar +// to the usage in eigen based spatial convolution described above. +// +// 3) Finalize packing of columns in gemm_pack_colmajor after processing +// vectorized part with full packets (see eigen_spatial_convolutions.h). +template <typename TensorEvaluatorType, typename PacketType, typename IndexType> +class TensorEvaluatorHasPartialPacket +{ +public: + template <typename TensorEvaluatorT, typename PacketT, typename IndexT> + static auto functionExistsSfinae( + typename std::enable_if< + unpacket_traits<PacketT>::masked_load_available && + std::is_same< + PacketT, + decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( + std::declval<IndexT>(), + std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) + -> std::true_type; + + template <typename TensorEvaluatorT, typename PacketT, typename IndexT> + static auto functionExistsSfinae(...) -> std::false_type; + + typedef decltype( + functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; + + static constexpr bool value = status::value; +}; + +// Compute a mask for loading/storing coefficients in/from a packet in a +// [from, to) range. If the mask bit is 1, element will be loaded/stored. +template <typename Packet> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename std::enable_if<unpacket_traits<Packet>::masked_load_available, + typename unpacket_traits<Packet>::mask_t>::type + mask(int from, int to) +{ + const Index packet_size = internal::unpacket_traits<Packet>::size; + eigen_assert(0 <= from && to <= (packet_size + 1) && from < to); + + using Mask = typename internal::unpacket_traits<Packet>::mask_t; + const Mask mask_max = std::numeric_limits<Mask>::max(); + + return (mask_max >> (packet_size - to)) ^ (mask_max >> (packet_size - from)); +} + +} // namespace internal +} // namespace Eigen + +#endif // __NNFW_CKER_EIGEN_EIGEN_CONVOLUTION_HELPERS_H__ diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h new file mode 100644 index 000000000..92e1614d1 --- /dev/null +++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h @@ -0,0 +1,1783 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EIGEN_EIGEN_SPATIAL_CONVOLUTIONS_INL_H__ +#define __NNFW_CKER_EIGEN_EIGEN_SPATIAL_CONVOLUTIONS_INL_H__ + +#include "cker/eigen/eigen_convolution_helpers.h" + +// Note this header is used in both TF and TFLite. +namespace Eigen +{ + +namespace internal +{ + +// WARNING: Most of the code here implicitly assumes that the matrix is in +// ColMajor layout. This is guaranteed by the tensor contraction (see +// TensorContraction.h). 
+// +// Inside Eigen a tensor contraction is represented by a matrix multiplication. +// We don't want to actually extract image patches and reshape the result into +// a matrix (this involves allocating huge extra memory), so the patch +// extraction and reshape operations are implicit. +// +// TensorContractionInputMapper takes a matrix index and returns the coefficient +// (or the packet) of the "virtual tensor", that would be at that index if we +// were to actually reshape the result of patch extraction. +// +// TensorContractionSubMapper provides a similar view into the "virtual matrix" +// at the given vertical and horizontal offsets. +// +// "Virtual matrix" dimensions: +// *0: kernelChannels * kernelRows * kernelCols; +// 1: out_height * out_width; * OTHERS (e.g batches, etc...) +// +// *) extracted patches are continuous in memory (innermost dimension assuming +// col major layout) +// +// With this dimensions: +// row - offset within a single patch (in code: patchId) +// col - index of the extracted patch (in code: patchIndex) +// patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions) +// +// TODO(ezhulenev): Consolidate this part of the code with the image patch +// extraction code since they are both very similar. + +template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device, + typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side, + int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionInputMapper< + Scalar_, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ +public: + typedef Scalar_ Scalar; + + typedef TensorContractionInputMapper< + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; + + typedef TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; + + typedef SubMapper VectorMapper; + typedef SubMapper LinearMapper; + typedef typename packet_traits<Scalar>::type Packet; + + typedef TensorEvaluator<ArgType, Device> TensorEvaluatorT; + + EIGEN_DEVICE_FUNC + TensorContractionInputMapper( + const TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device> &tensor, + const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) + : m_impl(tensor.impl().impl()) + { + Index patch_rows; + Index patch_depth; + if (internal::traits<ArgType>::Layout == ColMajor) + { + patch_depth = tensor.impl().dimensions()[0]; + patch_rows = tensor.impl().dimensions()[1]; + m_patch_cols = tensor.impl().dimensions()[2]; + m_num_patches = tensor.impl().dimensions()[3]; + } + else + { + const size_t NumDims = tensor.impl().dimensions().size(); + patch_depth = tensor.impl().dimensions()[NumDims - 1]; + patch_rows = tensor.impl().dimensions()[NumDims - 2]; + m_patch_cols = tensor.impl().dimensions()[NumDims - 3]; + m_num_patches = tensor.impl().dimensions()[NumDims - 4]; + } + + // Strides for navigating through the 
single patch. + m_patch_row_stride = patch_depth; + m_patch_col_stride = patch_rows * m_patch_row_stride; + + m_patch_row_inflate_strides = tensor.impl().rowInflateStride(); + m_patch_col_inflate_strides = tensor.impl().colInflateStride(); + + m_colStride = patch_rows; + + m_outputRows = tensor.impl().outputRows(); + m_outputCols = tensor.impl().outputCols(); + m_row_strides = tensor.impl().userRowStride(); + m_col_strides = tensor.impl().userColStride(); + + m_in_row_strides = tensor.impl().userInRowStride(); + m_in_col_strides = tensor.impl().userInColStride(); + + if (internal::traits<ArgType>::Layout == ColMajor) + { + m_inputRows = tensor.impl().impl().dimensions()[1]; + m_inputCols = tensor.impl().impl().dimensions()[2]; + } + else + { + const int NumDims = tensor.impl().impl().dimensions().size(); + m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2]; + m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3]; + } + + m_rowInputStride = patch_depth; + m_colInputStride = patch_depth * m_inputRows; + m_patchInputStride = patch_depth * m_inputRows * m_inputCols; + + m_rowPaddingTop = tensor.impl().rowPaddingTop(); + m_colPaddingLeft = tensor.impl().colPaddingLeft(); + + m_fastPatchRowStride = internal::TensorIntDivisor<Index>(m_patch_row_stride); + m_fastPatchColStride = internal::TensorIntDivisor<Index>(m_patch_col_stride); + m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides); + m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches); + m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride); + m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows); + m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth); + } + + EIGEN_DEVICE_FUNC + TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper) + : m_impl(base_mapper.m_impl) + { + m_patch_cols = base_mapper.m_patch_cols; + m_num_patches = base_mapper.m_num_patches; + + m_patch_row_stride = base_mapper.m_patch_row_stride; + m_patch_col_stride = base_mapper.m_patch_col_stride; + + m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides; + m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides; + + m_colStride = base_mapper.m_colStride; + + m_rowInputStride = base_mapper.m_rowInputStride; + m_colInputStride = base_mapper.m_colInputStride; + m_patchInputStride = base_mapper.m_patchInputStride; + + m_inputRows = base_mapper.m_inputRows; + m_inputCols = base_mapper.m_inputCols; + + m_outputRows = base_mapper.m_outputRows; + m_outputCols = base_mapper.m_outputCols; + m_row_strides = base_mapper.m_row_strides; + m_col_strides = base_mapper.m_col_strides; + + m_in_row_strides = base_mapper.m_in_row_strides; + m_in_col_strides = base_mapper.m_in_col_strides; + + m_rowPaddingTop = base_mapper.m_rowPaddingTop; + m_colPaddingLeft = base_mapper.m_colPaddingLeft; + + m_fastPatchRowStride = base_mapper.m_fastPatchRowStride; + m_fastPatchColStride = base_mapper.m_fastPatchColStride; + m_fastInputRowStride = base_mapper.m_fastInputRowStride; + m_fastInputColStride = base_mapper.m_fastInputColStride; + m_fastNumPatches = base_mapper.m_fastNumPatches; + m_fastColStride = base_mapper.m_fastColStride; + m_fastOutputRows = base_mapper.m_fastOutputRows; + m_fastDimZero = base_mapper.m_fastDimZero; + } + + // If true, turns off some optimizations for loading packets since the image + // patches are "non-standard" such as there are 
non-trivial strides or + // inflations in the input. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const + { + return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 || + m_patch_col_inflate_strides != 1; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const + { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const + { + return LinearMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const + { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(0, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + // Load the coefficient at the patchIndex location instead of the usual + // m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. + // EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const + { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadCoeff(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const + { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(0, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + // Load the packet at the patchIndex location instead of the usual m_rowIndex, + // m_colIndex, m_otherIndex. This is currently only used by the gpu code. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const + { + Index rowIndex, colIndex, otherIndex; + computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex); + return loadPacket(row, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device> &impl() const { return m_impl; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; } + +private: + friend class TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + + // Load coefficient from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, + Index otherIndex) const + { + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex + colOffset * m_in_col_strides; + const Index origInputCol = (m_patch_col_inflate_strides == 1) + ? inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex + rowOffset * m_in_row_strides; + const Index origInputRow = (m_patch_row_inflate_strides == 1) + ? inputRow + : ((inputRow >= 0) ? 
(inputRow / m_fastInputRowStride) : 0); + if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols || + origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) || + (inputRow != origInputRow * m_patch_row_inflate_strides)) + { + return Scalar(0); + } + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + // This is the same as loadCoeff(...), but optimized for all `inflate_strides` + // and `in_strides` equal to 1 (template specialization without templates). + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex, Index colIndex, + Index otherIndex) const + { + eigen_assert(!nonStandardPatches()); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + const Index colOffset = patchOffset / m_fastColStride; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputCol = colIndex + colOffset; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || inputRow >= m_inputRows) + { + return Scalar(0); + } + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.coeff(inputIndex); + } + + // Load packet from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, + Index otherIndex) const + { + const Index packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols); + + if (nonStandardPatches()) + { + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + typedef decltype(m_impl) TensorEvaluatorT; + return loadPacketStandard<Packet, TensorEvaluatorT>(patchId, rowIndex, colIndex, otherIndex); + } + + // Helper function to load a 'partial' packet - this is the single column + // part of a packet that is split across two columns. In the 'partial' packet, + // the elements corresponding to the column (specified through colOffset) are + // loaded and the rest of the elements are zero-filled into the 'partial' + // packet. This function is called from loadPacketStandardFromTwoColumns(). + // This code path is exercised only when the packet type supports masked load + // and when the partial packet load is available in the TensorEvaluator. 
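+  // E.g. for a packet of 8 floats split 5/3 across two columns, the call for
+  // the first column fills lanes 0..4 via a masked load and zero-fills lanes
+  // 5..7; the call for the second column fills lanes 5..7, and por() of the
+  // two partial packets reconstructs the full packet.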
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPartialPacketStandard(Index rowIndex, Index colIndex,
+                                                       Index otherIndex, Index patchId,
+                                                       const Index span[],
+                                                       const Index patchOffsets[],
+                                                       Index colOffset) const
+  {
+    const Index inputCol = colIndex + colOffset;
+    const Index rowOffsets[2] = {patchOffsets[0] - colOffset * m_colStride,
+                                 patchOffsets[1] - colOffset * m_colStride};
+    const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]};
+
+    if (inputRows[0] >= m_inputRows || inputRows[1] < 0 || inputCol >= m_inputCols || inputCol < 0)
+    {
+      // Partial packet is all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    }
+    else if (inputRows[0] >= 0 && inputRows[1] < m_inputRows)
+    {
+      // From inputIndex-span[0], we need to load elements starting from index
+      // span[0] all the way up to (and including) span[1].
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex =
+          depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
+      return m_impl.template partialPacket<Packet>(inputIndex - span[0],
+                                                   mask<Packet>(span[0], span[1] + 1));
+    }
+    else
+    {
+      // Using slow path for this partial packet.
+      // We need to load elements starting from index span[0] all the way up to
+      // (and including) span[1]. We split this load into 3 parts:
+      //   0 : span[0]-1            - Zeros will be loaded for these indices
+      //   span[0] : span[1]        - Elements will be loaded here for these indices
+      //   span[1]+1 : packetSize-1 - Zeros will be loaded for these indices
+      const Index packetSize = internal::unpacket_traits<Packet>::size;
+      EIGEN_ALIGN_MAX
+      typename internal::remove_const<Scalar>::type values[packetSize];
+      for (int i = 0; i < span[0]; ++i)
+        values[i] = Scalar(0);
+      for (int i = span[0]; i < span[1] + 1; ++i)
+        values[i] = loadCoeff(patchId - span[0] + i, rowIndex, colIndex, otherIndex);
+      for (int i = span[1] + 1; i < packetSize; ++i)
+        values[i] = Scalar(0);
+      return internal::pload<Packet>(values);
+    }
+  }
+
+  // Helper function to load a packet that is split across two columns.
+  // If required, this function is called from loadPacketStandard() when the
+  // packet type supports masked load and when the partial packet load is
+  // available in the TensorEvaluator.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromTwoColumns(Index patchId, Index rowIndex,
+                                                              Index colIndex, Index otherIndex,
+                                                              const Index patchOffsets[],
+                                                              const Index colOffsets[]) const
+  {
+    eigen_assert(colOffsets[1] == colOffsets[0] + 1);
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+
+    // Packet to load will be split into 2 parts where each part spans a single
+    // column. First determine where to split.
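+    // patchIdSplit is the last within-patch offset that still falls into the
+    // first of the two columns; patchIdSplit + 1 starts the second column.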
+    const Index patchIdSplit = ((colOffsets[1] * m_colStride) * m_rowInputStride) - 1;
+    const Index patchOffsetSplit = patchIdSplit / m_fastDimZero;
+
+    // patchIds[i]:          patchId corresponding to partial packet i
+    // spans[i]:             Start and end indices corresponding to the elements
+    //                       to be loaded for partial packet i
+    // patchOffsets2Cols[i]: patchOffsets corresponding to partial packet i
+    const Index patchIds[2] = {patchId, patchIdSplit + 1};
+    const Index spans[2][2] = {{0, patchIdSplit - patchId},
+                               {patchIdSplit - patchId + 1, packetSize - 1}};
+    const Index patchOffsets2Cols[2][2] = {{patchOffsets[0], patchOffsetSplit},
+                                           {patchOffsetSplit + 1, patchOffsets[1]}};
+
+    // Load partial packets and do bit-wise OR to generate required packet
+    return internal::por<Packet>(
+        loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0],
+                                  patchOffsets2Cols[0], colOffsets[0]),
+        loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1],
+                                  patchOffsets2Cols[1], colOffsets[1]));
+  }
+
+  // Helper function to load a packet that is present in a single column.
+  // If required, this function is called from loadPacketStandard().
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Packet loadPacketStandardFromSingleColumn(Index patchId, Index rowIndex,
+                                                                Index colIndex, Index otherIndex,
+                                                                const Index patchOffsets[],
+                                                                const Index colOffsets[],
+                                                                const Index inputCols[]) const
+  {
+    eigen_assert(colOffsets[0] == colOffsets[1]);
+    const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride,
+                                 patchOffsets[1] - colOffsets[1] * m_colStride};
+    eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+    const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]};
+
+    if (inputRows[0] >= m_inputRows || inputRows[1] < 0)
+    {
+      // all zeros
+      return internal::pset1<Packet>(Scalar(0));
+    }
+
+    if (inputRows[0] >= 0 && inputRows[1] < m_inputRows)
+    {
+      // no padding
+      const Index depth = patchId - patchOffsets[0] * patchDepth();
+      const Index inputIndex =
+          depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex;
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
+  }
+
+  // Load standard packet from a patch specified by the "within patch offset"
+  // (patchId) and the precomputed indices of the first element of the patch.
+  // This function will be called if partial packet loading is not available
+  // for the TensorEvaluator or if the packet type does not support masked
+  // load.
+  template <typename PacketT, typename TensorEvaluatorT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
+      !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
+  loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const
+  {
+    const Index packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols);
+
+    eigen_assert(!nonStandardPatches());
+
+    if ((patchDepth() % packetSize) == 0)
+    {
+      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
+    }
+
+    // Offsets and input calculation here are identical to
+    // loadCoeffStandard(...), but repeated twice.
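+    // patchOffsets are the within-patch offsets of the packet's first and
+    // last lanes; when both land in the same column the packet can be served
+    // by a single-column load.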
+ const Index patchOffsets[2] = {patchId / m_fastDimZero, + (patchId + packetSize - 1) / m_fastDimZero}; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]}; + + if (inputCols[0] >= m_inputCols || inputCols[1] < 0) + { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + if (inputCols[0] == inputCols[1]) + { + return loadPacketStandardFromSingleColumn(patchId, rowIndex, colIndex, otherIndex, + patchOffsets, colOffsets, inputCols); + } + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + // Load standard packet from a patch specified by the "within patch offset" + // (patchId) and the precomputed indices of the first element of the patch. + // This function will be called if partial packet loading is available for + // the TensorEvaluator and if the packet type supports masked load. + // The only difference between this and the other case is that if the packet + // to load is split across two columns, then in this case instead of going to + // the slow (element-by-element) load, we load two packets - each containing + // elements from one of the columns (rest of the elements of the packets are + // zeroes), and then combine these two packets to generate the required + // packet. The idea is to enable fast load (if possible) of these 'partial' + // packets. + template <typename PacketT, typename TensorEvaluatorT> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const + { + const Index packetSize = internal::unpacket_traits<PacketT>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols); + + eigen_assert(!nonStandardPatches()); + + if ((patchDepth() % packetSize) == 0) + { + return loadPacketFast(patchId, rowIndex, colIndex, otherIndex); + } + + // Offsets and input calculation here are identical to + // loadCoeffStandard(...), but repeated twice. 
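+    // Same prologue as the overload above; the difference is the extra
+    // two-column fast path at the end, which stitches the packet together
+    // from two masked partial loads instead of falling back to
+    // element-by-element loading.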
+ const Index patchOffsets[2] = {patchId / m_fastDimZero, + (patchId + packetSize - 1) / m_fastDimZero}; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]}; + + if (inputCols[0] >= m_inputCols || inputCols[1] < 0) + { + // all zeros + return internal::pset1<PacketT>(Scalar(0)); + } + if (inputCols[0] == inputCols[1]) + { + return loadPacketStandardFromSingleColumn(patchId, rowIndex, colIndex, otherIndex, + patchOffsets, colOffsets, inputCols); + } + if (inputCols[1] == inputCols[0] + 1) + { + return loadPacketStandardFromTwoColumns(patchId, rowIndex, colIndex, otherIndex, patchOffsets, + colOffsets); + } + return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex, Index colIndex, + Index otherIndex) const + { + const Index packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(patchId < patchDepth() * patchRows() * m_patch_cols); + + eigen_assert(!nonStandardPatches()); + eigen_assert((patchDepth() % packetSize) == 0); + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = patchId / m_fastDimZero; + eigen_assert((patchId + packetSize - 1) / m_fastDimZero == patchOffset); + + const Index colOffset = patchOffset / m_fastColStride; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputCol = colIndex + colOffset; + const Index inputRow = rowIndex + rowOffset; + if (inputCol < 0 || inputRow < 0 || inputCol >= m_inputCols || inputRow >= m_inputRows) + { + // all zeros + return internal::pset1<Packet>(Scalar(0)); + } + // no padding + const Index depth = patchId - patchOffset * patchDepth(); + const Index inputIndex = + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + return m_impl.template packet<Unaligned>(inputIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex, + Index colIndex, + Index otherIndex) const + { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX + typename internal::remove_const<Scalar>::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) + { + values[i] = loadCoeff(patchId + i, rowIndex, colIndex, otherIndex); + } + Packet rslt = internal::pload<Packet>(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void + computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const + { + const size_t NumInputDims = + array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; + const Index patch2DIndex = + (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + otherIndex *= m_patchInputStride; + colIndex = patch2DIndex / m_fastOutputRows; + rowIndex = patch2DIndex - colIndex * m_outputRows; + colIndex = colIndex * m_col_strides - m_colPaddingLeft; + rowIndex = rowIndex * m_row_strides - m_rowPaddingTop; + } + + Index m_patch_cols; // number of columns in the patch + Index m_num_patches; // number of patches to extract. + + // Strides for navigating through the single patch. 
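+  // (For standard patches these reduce to m_patch_row_stride == patch depth
+  // and m_patch_col_stride == patch depth * patch rows; patchRowStride() in
+  // the sub-mapper below asserts the former identity.)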
+ Index m_patch_row_stride; + Index m_patch_col_stride; + internal::TensorIntDivisor<Index> m_fastPatchRowStride; + internal::TensorIntDivisor<Index> m_fastPatchColStride; + + Index m_patch_row_inflate_strides; // the strides for row inflation in the + // image patch + Index m_patch_col_inflate_strides; // the strides for col inflation in the + // image patch + // Fast representation of inflation strides. + internal::TensorIntDivisor<Index> m_fastInputRowStride; + internal::TensorIntDivisor<Index> m_fastInputColStride; + + Index m_otherStride; + Index m_colStride; + internal::TensorIntDivisor<Index> m_fastNumPatches; + internal::TensorIntDivisor<Index> m_fastColStride; + + Index m_rowInputStride; // row stride in the input tensor + Index m_colInputStride; // col stride in the input tensor + Index m_patchInputStride; // patch stride in the input tensor + + Index m_inputRows; // Number of rows in the input tensor + Index m_inputCols; // Number of cols in the input tensor + + Index m_outputRows; // Number of convolution output rows + Index m_outputCols; // Number of convolution output column + + Index m_row_strides; // User specified row stride + Index m_col_strides; // User specified col stride + + Index m_in_row_strides; // User specified input row stride + Index m_in_col_strides; // User specified input col stride + + Index m_rowPaddingTop; // Row padding + Index m_colPaddingLeft; // Column padding + + internal::TensorIntDivisor<Index> m_fastOutputRows; + internal::TensorIntDivisor<Index> m_fastDimZero; + + const TensorEvaluator<ArgType, Device> m_impl; +}; + +template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device, + typename Scalar, typename Index, typename nocontract_t, typename contract_t, int Side, + int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> +class TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> +{ +public: + typedef typename packet_traits<Scalar>::type Packet; + typedef typename packet_traits<Scalar>::half HalfPacket; + + typedef TensorContractionInputMapper< + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + ParentMapper; + + typedef TensorContractionSubMapper< + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; + + typedef Self LinearMapper; + + typedef typename ParentMapper::TensorEvaluatorT TensorEvaluatorT; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper, + Index vert_offset, + Index horiz_offset) + : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) + { + m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper, + Index vert_offset, + Index horiz_offset) + : m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset), + 
m_base_mapper(base_mapper.m_base_mapper) + { + m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const + { + return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const + { + return m_base_mapper(i + m_depth_offset, j + m_col_offset); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const + { + return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const + { + return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset, j + m_col_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar loadCoeffStandard(Index i) const + { + return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex, m_colIndex, + m_otherIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const + { + return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index i) const + { + typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT; + return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>( + i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + } + template <typename Packet> EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool nonStandardPatches() const { return m_base_mapper.nonStandardPatches(); } + + // Max(Col|Row|Depth): compute the upper limit for the column, row and depth + // index respectively that fits into the peeled_k elements starting at + // m_depth_offset. + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const + { + const Index max_col = + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); + return std::min<Index>(1 + max_col, patchCols()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const + { + const Index max_row = + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / + fastPatchRowStride(); + return std::min<Index>(1 + max_row, patchRows()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col, Index row) const + { + const Index max_depth = m_depth_offset + peeled_k - // + col * patchColStride() - // + row * patchRowStride(); + return std::min<Index>(max_depth, patchDepth()); + } + + // MaxDepth uses only the remaining number of elements in the peeled_k. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements, const Index start_depth) const + { + return std::min<Index>(start_depth + num_elements, patchDepth()); + } + + // Every register matters in this code, so sometimes to prevent register + // spilling, instead of the variable that you would expect to see, we use + // another one, that is guaranteed to have the same value. E.g. patch depth is + // always the same as input depth, and it's also the same as input row stride. + // Bunch of other parameters have similar relations. 
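+  // For instance, patchDepth() below returns m_base_mapper.m_rowInputStride:
+  // for a depth-major input, stepping one input row skips exactly one patch
+  // depth's worth of scalars, so a single register serves both purposes.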
+ + typedef internal::TensorIntDivisor<Index> IndexDivisor; + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_base_mapper.m_rowInputStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRows() const { return m_base_mapper.m_colStride; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchCols() const { return m_base_mapper.m_patch_cols; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchRowStride() const + { + eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride && + "Patch depth must be equal to patch row stride."); + return patchDepth(); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index patchColStride() const { return m_base_mapper.m_patch_col_stride; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchRowStride() const + { + eigen_assert(patchDepth() == m_base_mapper.m_patch_row_stride && + "Patch depth must be equal to patch row stride."); + return m_base_mapper.m_fastDimZero; // patch_depth + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE IndexDivisor fastPatchColStride() const + { + return m_base_mapper.m_fastPatchColStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const + { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth, const Index baseIndex) const + { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.coeff(inputIndex); + } + template <typename PacketT = Packet> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const + { + const Index inputIndex = depth + baseIndex; + return m_base_mapper.m_impl.template partialPacket<PacketT>(inputIndex, + mask<PacketT>(0, num_coeffs)); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool hasPadding() const + { + // TODO(ezhulenev): It does seems that for inflated filter it's still + // possible to guarantee "no padding or skipping" for non-standard packing. + if (nonStandardPatches()) + return true; + + // Non zero padding before. + if (m_base_mapper.m_rowPaddingTop > 0) + return true; + if (m_base_mapper.m_colPaddingLeft > 0) + return true; + + // Non zero padding after in rows. + const Index last_row = (m_base_mapper.m_outputRows - 1) * m_base_mapper.m_row_strides; + if (last_row + (patchRows() - 1) >= m_base_mapper.m_inputRows) + return true; + + // Non zero padding after in cols. + const Index last_col = (m_base_mapper.m_outputCols - 1) * m_base_mapper.m_col_strides; + if (last_col + (patchCols() - 1) >= m_base_mapper.m_inputCols) + return true; + + return false; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padRow(const Index row) const + { + const Index r = m_rowIndex + row; + return r < 0 || r >= m_base_mapper.m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row, const Index last_row) const + { + return m_rowIndex + first_row < 0 || m_rowIndex + last_row >= m_base_mapper.m_inputRows; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padOrSkipRow(const Index row, Index *orig_row) const + { + eigen_assert(nonStandardPatches()); + + const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides; + *orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1) + ? 
input_row + : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); + + return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) || + (input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padCol(const Index col) const + { + const Index c = m_colIndex + col; + return c < 0 || c >= m_base_mapper.m_inputCols; + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE bool padOrSkipCol(const Index col, Index *orig_col) const + { + eigen_assert(nonStandardPatches()); + + const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides; + *orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1) + ? input_col + : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0); + + return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) || + (input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides); + } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const + { + const Index r = m_rowIndex + row; + const Index c = m_colIndex + col; + return r * m_base_mapper.m_rowInputStride + c * m_base_mapper.m_colInputStride + m_otherIndex; + } + // Compute a base index when original input row and column were precomputed + // using padOrSkipRow and padOrSkipCol. Used only for non standard patches. + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index origBaseIndex(const Index orig_row, const Index orig_col) const + { + return orig_row * m_base_mapper.m_rowInputStride + orig_col * m_base_mapper.m_colInputStride + + m_otherIndex; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowStride() const { return m_base_mapper.m_row_strides; } + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colStride() const { return m_base_mapper.m_col_strides; } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index rowOffset() const + { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return patchOffset - colOffset * m_base_mapper.m_colStride; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index colOffset() const + { + const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero; + const Index colOffset = patchOffset / m_base_mapper.m_fastColStride; + return colOffset; + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Index depthOffset() const { return m_depth_offset % patchDepth(); } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const + { + return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset); + } + +private: + Index m_depth_offset; // First row in the input matrix + Index m_col_offset; // First col in the input matrix + + // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base + // indices for the first element in a patch specified by col_offset + // (see computeBaseIndices(...) for details). + Index m_rowIndex; + Index m_colIndex; + Index m_otherIndex; + + const ParentMapper m_base_mapper; // Keeping a copy instead of a reference + // performs better in benchmarks. +}; + +// Arrange a block of the right input matrix (in our case it's always a "virtual +// matrix" constructed from extracted image patches) in contiguous memory. +// +// Given column major input (A0 beside A1 in memory): +// A0 B0 C0 D0 E0 F0 G0 H0 ... Z0 +// A1 B1 C1 D1 E1 F1 G1 H1 ... Z1 +// A2 B2 C2 D2 E2 F2 G2 H2 ... Z2 +// A3 B3 C3 D3 E3 F3 G3 H3 ... Z3 +// A4 B4 C4 D4 E4 F4 G4 H4 ... 
Z4 +// A5 B5 C5 D5 E5 F5 G5 H5 ... Z5 +// A6 B6 C6 D6 E6 F6 G6 H6 ... Z6 +// A7 B7 C7 D7 E7 F7 G7 H7 ... Z7 +// A8 ... +// ... +// +// *) A, B, C, ... - patches extracted from the original input. +// *) A0, A1, A2 ... - values from the same patch at different offsets. +// +// The traversal (packed rhs memory) order (B0 besides A0 in memory): +// A0 B0 C0 D0 A1 B1 C1 D1 ... +// E0 F0 G0 H0 E1 F1 G1 H1 ... +// ... +// Z0 Z1 Z2 Z3 Z4 Z5 Z6 Z7 ... <- doesn't belong to any block (nr = 4) +// +// This traversal order must be the same as in default gemm_pack_rhs defined in +// GeneralBlockPanelKernel.h. +// +// *) nr - number of registers along the 'n' dimension. +// See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix +// Multiplication" paper. +template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device, + typename Scalar, typename Index, typename nocontract_t, typename contract_t, + int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, + int nr> +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, + Alignment>, + nr, ColMajor, false, false> +{ + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; + typedef SubMapper DataMapper; + typedef typename packet_traits<Scalar>::type Packet; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar *block, const DataMapper &rhs, Index depth, Index cols, + Index stride = 0, Index offset = 0) const + { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + (void)stride; + (void)offset; + + const Index packet_cols4 = (cols / 4) * 4; + const Index peeled_k = (depth / packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) + { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k = 0; + if ((packet_size % 4) == 0 && !non_standard_patches) + { + // FAST PATH: + // Iterate over patch columns and rows, if we know that a single + // packet do not span across multiple rows or columns. + if ((rhs.patchDepth() % packet_size) == 0) + { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) + { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + + // Check if we can squeeze reads along the `row` and `depth` + // dimensions (two innermost dimensions). 
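+            // Illustrative numbers (hypothetical): with patchDepth() == 8,
+            // start_row == 0, max_row == 3 and start_depth == 0, the
+            // squeeze_length computed below is 3 * 8 == 24, so up to 24
+            // consecutive scalars can be copied packet by packet with no
+            // per-row padding checks.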
+ if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 && // + !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) && // + !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) && // + !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) && // + !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) + { + // Compute how many elements we can squeeze read. + const Index start_depth = (c == start_col) ? rhs.depthOffset() : 0; + + // Upper bound for the number of elements in the depth dimension + // that we can squeeze read. + const Index squeeze_length = (max_row - start_row) * rhs.patchDepth() - start_depth; + + // Do not overshoot beyond the block size. + const Index max_depth = start_depth + std::min<Index>(peeled_k - k, squeeze_length); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + const Index idx0 = dm0.baseIndex(start_row, c); + const Index idx1 = dm1.baseIndex(start_row, c); + const Index idx2 = dm2.baseIndex(start_row, c); + const Index idx3 = dm3.baseIndex(start_row, c); + + for (Index d = start_depth; d < max_depth; d += packet_size) + { + eigen_assert(k < peeled_k); + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = rhs.packetNoPadding(d, idx0); + kernel.packet[1] = rhs.packetNoPadding(d, idx1); + kernel.packet[2] = rhs.packetNoPadding(d, idx2); + kernel.packet[3] = rhs.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block + 0 * packet_size, kernel.packet[0]); + pstoreu(block + 1 * packet_size, kernel.packet[1]); + pstoreu(block + 2 * packet_size, kernel.packet[2]); + pstoreu(block + 3 * packet_size, kernel.packet[3]); + block += 4 * packet_size; + k += packet_size; + } + + // Go to the next column. + continue; + } + + // If we can't squeeze reads, process rows one by one. + for (Index r = start_row; r < max_row; ++r) + { + eigen_assert(k <= peeled_k); + + const bool pad0 = pad_col0 || dm0.padRow(r); + const bool pad1 = pad_col1 || dm1.padRow(r); + const bool pad2 = pad_col2 || dm2.padRow(r); + const bool pad3 = pad_col3 || dm3.padRow(r); + + const Index idx0 = dm0.baseIndex(r, c); + const Index idx1 = dm1.baseIndex(r, c); + const Index idx2 = dm2.baseIndex(r, c); + const Index idx3 = dm3.baseIndex(r, c); + + const Index start_depth = + ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) + { + eigen_assert(k < peeled_k); + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = pad0 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx0); + kernel.packet[1] = pad1 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx1); + kernel.packet[2] = pad2 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx2); + kernel.packet[3] = pad3 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx3); + ptranspose(kernel); + pstoreu(block + 0 * packet_size, kernel.packet[0]); + pstoreu(block + 1 * packet_size, kernel.packet[1]); + pstoreu(block + 2 * packet_size, kernel.packet[2]); + pstoreu(block + 3 * packet_size, kernel.packet[3]); + block += 4 * packet_size; + k += packet_size; + } + } + } + + // The loop above should fill peeled_k elements. 
+ eigen_assert(peeled_k == k); + } + else + { + for (; k < peeled_k; k += packet_size) + { + PacketBlock<Packet, 4> kernel; + kernel.packet[0] = dm0.loadPacketStandard(k); + kernel.packet[1] = dm1.loadPacketStandard(k); + kernel.packet[2] = dm2.loadPacketStandard(k); + kernel.packet[3] = dm3.loadPacketStandard(k); + ptranspose(kernel); + pstoreu(block + 0 * packet_size, kernel.packet[0]); + pstoreu(block + 1 * packet_size, kernel.packet[1]); + pstoreu(block + 2 * packet_size, kernel.packet[2]); + pstoreu(block + 3 * packet_size, kernel.packet[3]); + block += 4 * packet_size; + } + } + } + + // Copy the remaining coefficients of the column block after the peeled_k. + if (!rhs.nonStandardPatches()) + { + for (; k < depth; k++) + { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } + else + { + for (; k < depth; k++) + { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // copy the remaining columns one at a time (nr==1) + for (Index j2 = packet_cols4; j2 < cols; ++j2) + { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) + { + *block = dm0(k); + block += 1; + } + } + } +}; + +// Template specialization for packet_size = 2. We must special-case packet +// blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>. +template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device, + typename Scalar, typename Index, typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> +{ + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; + typedef SubMapper DataMapper; + typedef typename packet_traits<Scalar>::type Packet; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar *block, const DataMapper &rhs, Index depth, Index cols, + Index stride = 0, Index offset = 0) const + { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + (void)stride; + (void)offset; + + const int packet_size = 2; + const Index packet_cols4 = (cols / 4) * 4; + const Index peeled_k = (depth / packet_size) * packet_size; + const bool non_standard_patches = rhs.nonStandardPatches(); + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) + { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k = 0; + if (!non_standard_patches) + { + // FAST PATH: + // Iterate over patch columns and rows if we know that a single + // packet do not span across multiple rows or columns. 
+ if ((rhs.patchDepth() % packet_size) == 0) + { + const Index start_col = rhs.colOffset(); + const Index max_col = rhs.maxCol(peeled_k); + + for (Index c = start_col; c < max_col; ++c) + { + eigen_assert(k <= peeled_k); + + const Index start_row = (c == start_col) ? rhs.rowOffset() : 0; + const Index max_row = rhs.maxRow(peeled_k, c); + + const bool pad_col0 = dm0.padCol(c); + const bool pad_col1 = dm1.padCol(c); + const bool pad_col2 = dm2.padCol(c); + const bool pad_col3 = dm3.padCol(c); + + // We can squeeze reads along the `row` and `depth` dimensions if + // the row stride is `1`, which means that `row` and `depth` + // dimensions are contiguous (two innermost dimensions). + if (rhs.rowStride() == 1 && // + !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 && // + !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) && // + !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) && // + !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) && // + !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) + { + // Compute how many elements we can squeeze read. + const Index start_depth = (c == start_col) ? rhs.depthOffset() : 0; + + // Upper bound for the number of elements in the depth dimension + // that we can squeeze read. + const Index squeeze_length = (max_row - start_row) * rhs.patchDepth() - start_depth; + + // Do not overshoot beyond the block size. + const Index max_depth = start_depth + std::min<Index>(peeled_k - k, squeeze_length); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + const Index idx0 = dm0.baseIndex(start_row, c); + const Index idx1 = dm1.baseIndex(start_row, c); + const Index idx2 = dm2.baseIndex(start_row, c); + const Index idx3 = dm3.baseIndex(start_row, c); + + for (Index d = start_depth; d < max_depth; d += packet_size) + { + PacketBlock<Packet, 2> kernel0; + PacketBlock<Packet, 2> kernel1; + kernel0.packet[0] = rhs.packetNoPadding(d, idx0); + kernel0.packet[1] = rhs.packetNoPadding(d, idx1); + kernel1.packet[0] = rhs.packetNoPadding(d, idx2); + kernel1.packet[1] = rhs.packetNoPadding(d, idx3); + ptranspose(kernel0); + ptranspose(kernel1); + pstoreu(block + 0 * packet_size, kernel0.packet[0]); + pstoreu(block + 1 * packet_size, kernel1.packet[0]); + pstoreu(block + 2 * packet_size, kernel0.packet[1]); + pstoreu(block + 3 * packet_size, kernel1.packet[1]); + block += 4 * packet_size; + k += packet_size; + } + + // Go to the next column. + continue; + } + + // If we can't squeeze reads, process rows one by one. + for (Index r = start_row; r < max_row; ++r) + { + eigen_assert(k <= peeled_k); + + const bool pad0 = pad_col0 || dm0.padRow(r); + const bool pad1 = pad_col1 || dm1.padRow(r); + const bool pad2 = pad_col2 || dm2.padRow(r); + const bool pad3 = pad_col3 || dm3.padRow(r); + + const Index idx0 = dm0.baseIndex(r, c); + const Index idx1 = dm1.baseIndex(r, c); + const Index idx2 = dm2.baseIndex(r, c); + const Index idx3 = dm3.baseIndex(r, c); + + const Index start_depth = + ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); + eigen_assert((max_depth - start_depth) % packet_size == 0); + + for (Index d = start_depth; d < max_depth; d += packet_size) + { + eigen_assert(k < peeled_k); + PacketBlock<Packet, 2> kernel0; + PacketBlock<Packet, 2> kernel1; + kernel0.packet[0] = pad0 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx0); + kernel0.packet[1] = pad1 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx1); + kernel1.packet[0] = pad2 ? 
pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx2);
+                  kernel1.packet[1] = pad3 ? pset1<Packet>(Scalar(0)) : rhs.packetNoPadding(d, idx3);
+                  ptranspose(kernel0);
+                  ptranspose(kernel1);
+                  pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                  pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                  pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                  pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                  block += 4 * packet_size;
+                  k += packet_size;
+                }
+              }
+            }
+
+            // The loop above should fill peeled_k elements.
+            eigen_assert(peeled_k == k);
+          }
+          else
+          {
+            // Packet can span multiple rows or columns, so we have to go
+            // through the slower "standard" path.
+            for (; k < peeled_k; k += packet_size)
+            {
+              PacketBlock<Packet, 2> kernel0;
+              PacketBlock<Packet, 2> kernel1;
+              kernel0.packet[0] = dm0.loadPacketStandard(k);
+              kernel0.packet[1] = dm1.loadPacketStandard(k);
+              kernel1.packet[0] = dm2.loadPacketStandard(k);
+              kernel1.packet[1] = dm3.loadPacketStandard(k);
+              ptranspose(kernel0);
+              ptranspose(kernel1);
+              pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+              pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+              pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+              pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+              block += 4 * packet_size;
+            }
+          }
+        }
+
+        // Copy the remaining coefficients of the column block after the peeled_k.
+        if (!non_standard_patches)
+        {
+          for (; k < depth; k++)
+          {
+            block[0] = dm0.loadCoeffStandard(k);
+            block[1] = dm1.loadCoeffStandard(k);
+            block[2] = dm2.loadCoeffStandard(k);
+            block[3] = dm3.loadCoeffStandard(k);
+            block += 4;
+          }
+        }
+        else
+        {
+          for (; k < depth; k++)
+          {
+            block[0] = dm0(k);
+            block[1] = dm1(k);
+            block[2] = dm2(k);
+            block[3] = dm3(k);
+            block += 4;
+          }
+        }
+      }
+
+      // Copy the remaining columns one at a time (nr==1).
+      for (Index j2 = packet_cols4; j2 < cols; ++j2)
+      {
+        const SubMapper dm0 = rhs.getLinearMapper(0, j2);
+        for (Index k = 0; k < depth; k++)
+        {
+          *block = dm0(k);
+          block += 1;
+        }
+      }
+    }
+  };
+
+// Special case for non-vectorized types such as float16.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typename Device, + typename Scalar, typename Index, typename nocontract_t, typename contract_t, + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> +struct gemm_pack_rhs< + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> +{ + typedef TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, + Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; + typedef SubMapper DataMapper; + + EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_DEVICE_FUNC + EIGEN_DONT_INLINE void operator()(Scalar *block, const DataMapper &rhs, Index depth, Index cols, + Index stride = 0, Index offset = 0) const + { + eigen_assert(stride == 0); + eigen_assert(offset == 0); + + (void)offset; + (void)stride; + + const Index packet_cols4 = (cols / 4) * 4; + + for (Index j2 = 0; j2 < packet_cols4; j2 += 4) + { + const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + if (!rhs.nonStandardPatches()) + { + for (Index k = 0; k < depth; k++) + { + block[0] = dm0.loadCoeffStandard(k); + block[1] = dm1.loadCoeffStandard(k); + block[2] = dm2.loadCoeffStandard(k); + block[3] = dm3.loadCoeffStandard(k); + block += 4; + } + } + else + { + for (Index k = 0; k < depth; k++) + { + block[0] = dm0(k); + block[1] = dm1(k); + block[2] = dm2(k); + block[3] = dm3(k); + block += 4; + } + } + } + + // Copy the remaining columns one at a time (nr==1). + for (Index j2 = packet_cols4; j2 < cols; ++j2) + { + const SubMapper dm0 = rhs.getLinearMapper(0, j2); + for (Index k = 0; k < depth; k++) + { + *block = dm0(k); + block += 1; + } + } + } +}; +} // end namespace internal + +/** SpatialConvolution + * \ingroup CXX11_NeuralNetworks_Module + * + * \brief Applies a 2D convolution over a multichannel input image. + * + * The input parameter is expected to be a tensor with a rank of 3 or more + * (channels, height, width, and optionally others) + * The kernel parameter is expected to be a 4D tensor (filters, channels, + * kernel_height, kernel_width) + * The input and the kernel must both be in col-major layout. The result will + * also be in col-major layout. + * + * If col_in_stride, row_in_stride > 1, then applies convolution with holes + * (aka atrous convolution), sampling every col_in_stride, row_in_stride input + * pixels. + * + * If padding_top, padding_bottom, padding_left, or padding_right is specified, + * then those paddings will be used to pad the input, and padding_type must be + * PADDING_VALID. + * + * The result can be assigned to a tensor of rank equal to the rank of the + * input. The dimensions of the result will be filters, height, width (and + * others if applicable). + * + * It is possible to swap the order of the width and height dimensions provided + * that the same order is used in the input, the kernel, and the output. 
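+ *
+ * A minimal usage sketch (hypothetical shapes, ColMajor):
+ *
+ *   Eigen::Tensor<float, 4> input(3, 32, 32, 8);  // channels, rows, cols, batch
+ *   Eigen::Tensor<float, 4> filter(16, 3, 5, 5);  // filters, channels, rows, cols
+ *   Eigen::Tensor<float, 4> result(16, 32, 32, 8);
+ *   result = SpatialConvolution(input, filter);   // stride 1, PADDING_SAME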
+ * + * It is also possible to add an output kernel to the contraction, output + * kernel is called by Eigen when it "finalizes" the block of an output tensor. + * + */ +template <typename Input, typename Kernel, typename OutputKernel = const NoOpOutputKernel> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const OutputKernel>>, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, + internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const OutputKernel>>>::type +SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1, + const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME, + const Index row_in_stride = 1, const Index col_in_stride = 1, + const OutputKernel &output_kernel = OutputKernel(), Index padding_top = 0, + Index padding_bottom = 0, Index padding_left = 0, Index padding_right = 0) +{ + typedef typename internal::traits<Input>::Index TensorIndex; + TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, + internal::traits<Input>::Layout, TensorIndex>> + in(input); + TensorRef< + Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, + internal::traits<Kernel>::Layout, TensorIndex>> + kern(kernel); + + EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, + YOU_MADE_A_PROGRAMMING_MISTAKE) + const bool isColMajor = (internal::traits<Input>::Layout == ColMajor); + + const int NumDims = internal::traits<Input>::NumDimensions; + + // Number of filters to apply. This is the same as the output depth of the + // result + const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3]; + // Number of channels. This is the same as the input depth. + const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2]; + const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1]; + const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0]; + + const Index kernelRowsEff = kernelRows + (kernelRows - 1) * (row_in_stride - 1); + const Index kernelColsEff = kernelCols + (kernelCols - 1) * (col_in_stride - 1); + + array<IndexPair<TensorIndex>, 1> contract_dims; + contract_dims[0] = IndexPair<TensorIndex>(1, 0); + + const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2); + const TensorIndex InputCols = isColMajor ? 
in.dimension(2) : in.dimension(NumDims - 3);
+  const bool padding_explicit = (padding_top || padding_bottom || padding_left || padding_right);
+
+  TensorIndex out_height;
+  TensorIndex out_width;
+  switch (padding_type)
+  {
+    case PADDING_VALID:
+    {
+      const TensorIndex InputRowsEff = InputRows + padding_top + padding_bottom;
+      const TensorIndex InputColsEff = InputCols + padding_left + padding_right;
+      out_height = divup(InputRowsEff - kernelRowsEff + 1, row_stride);
+      out_width = divup(InputColsEff - kernelColsEff + 1, col_stride);
+      break;
+    }
+    case PADDING_SAME:
+    {
+      eigen_assert(!padding_explicit);
+      out_height = divup(InputRows, row_stride);
+      out_width = divup(InputCols, col_stride);
+      break;
+    }
+    default:
+    {
+      // Initialize unused variables to avoid a compiler warning
+      out_height = 0;
+      out_width = 0;
+      eigen_assert(false && "unexpected padding");
+    }
+  }
+
+  // Molds the output of the patch extraction code into a 2d tensor:
+  // - the first dimension (dims[0]): the patch values to be multiplied with the
+  //   kernels
+  // - the second dimension (dims[1]): everything else
+  DSizes<TensorIndex, 2> pre_contract_dims;
+  if (isColMajor)
+  {
+    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[1] = out_height * out_width;
+    for (int i = 3; i < NumDims; ++i)
+    {
+      pre_contract_dims[1] *= in.dimension(i);
+    }
+  }
+  else
+  {
+    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[0] = out_height * out_width;
+    for (int i = 0; i < NumDims - 3; ++i)
+    {
+      pre_contract_dims[0] *= in.dimension(i);
+    }
+  }
+
+  // Molds the output of the contraction into the shape expected by the user
+  // (assuming this is ColMajor):
+  // - 1st dim: kernel filters
+  // - 2nd dim: output height
+  // - 3rd dim: output width
+  // - 4th dim and beyond: everything else including batch size
+  DSizes<TensorIndex, NumDims> post_contract_dims;
+  if (isColMajor)
+  {
+    post_contract_dims[0] = kernelFilters;
+    post_contract_dims[1] = out_height;
+    post_contract_dims[2] = out_width;
+    for (int i = 3; i < NumDims; ++i)
+    {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  }
+  else
+  {
+    post_contract_dims[NumDims - 1] = kernelFilters;
+    post_contract_dims[NumDims - 2] = out_height;
+    post_contract_dims[NumDims - 3] = out_width;
+    for (int i = 0; i < NumDims - 3; ++i)
+    {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  }
+
+  DSizes<TensorIndex, 2> kernel_dims;
+  if (isColMajor)
+  {
+    kernel_dims[0] = kernelFilters;
+    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
+  }
+  else
+  {
+    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
+    kernel_dims[1] = kernelFilters;
+  }
+  if (padding_explicit)
+  {
+    return choose(
+      Cond<internal::traits<Input>::Layout == ColMajor>(),
+      kernel.reshape(kernel_dims)
+        .contract(input
+                    .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+                                           row_in_stride, col_in_stride,
+                                           /*row_inflate_stride=*/1,
+                                           /*col_inflate_stride=*/1, padding_top,
+                                           padding_bottom, padding_left, padding_right,
+                                           /*padding_value=*/0)
+                    .reshape(pre_contract_dims),
+                  contract_dims, output_kernel)
+        .reshape(post_contract_dims),
+      input
+        .extract_image_patches(
+          kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride,
+          /*row_inflate_stride=*/1,
+          /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right,
+          /*padding_value=*/0)
+        .reshape(pre_contract_dims)
+        .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+        .reshape(post_contract_dims));
+  }
+  else
+  {
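+    // Implicit-padding path: extract_image_patches derives the SAME/VALID
+    // padding amounts from padding_type itself instead of taking explicit
+    // per-side values.
+    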
return choose( + Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, + col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); + } +} + +} // end namespace Eigen + +#endif // __NNFW_CKER_EIGEN_EIGEN_SPATIAL_CONVOLUTIONS_INL_H__ diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions.h new file mode 100644 index 000000000..c6f1e2ee7 --- /dev/null +++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2015 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EGIEN_EIGEN_SPATIAL_CONVOLUTIONS_H__ +#define __NNFW_CKER_EGIEN_EIGEN_SPATIAL_CONVOLUTIONS_H__ + +//#define EIGEN_USE_CUSTOM_THREAD_POOL +#define EIGEN_USE_THREADS +#include "unsupported/Eigen/CXX11/Tensor" + +// Note the following header is used in both TF and TFLite. Particularly, it's +// used for float TFLite Conv2D. +#include "cker/eigen/eigen_spatial_convolutions-inl.h" + +#endif // __NNFW_CKER_EGIEN_EIGEN_SPATIAL_CONVOLUTIONS_H__ diff --git a/compute/cker/include/cker/eigen/eigen_tensor_reduced_instantiations_oss.h b/compute/cker/include/cker/eigen/eigen_tensor_reduced_instantiations_oss.h new file mode 100644 index 000000000..87697e240 --- /dev/null +++ b/compute/cker/include/cker/eigen/eigen_tensor_reduced_instantiations_oss.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This is essentially unsupported/CXX11/Eigen/Tensor.h +// TODO(petewarden) - move this to a common location in Eigen itself. 
+ +// clang-format off + + +#ifndef __NNFW_CKER_EGIEN_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H__ +#define __NNFW_CKER_EGIEN_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H__ + + +#include "Eigen/Core" + +#if defined(EIGEN_USE_SYCL) +#undef min +#undef max +#undef isnan +#undef isinf +#undef isfinite +#include <CL/sycl.hpp> +#include <iostream> +#include <map> +#include <memory> +#include <utility> +#endif +#include <cmath> +#include <cstddef> +#include <cstring> + + + + + +#ifdef _WIN32 +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#include <windows.h> +#else +#include <stdint.h> +#include <unistd.h> +#endif + +#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 +#include <random> +#endif + +#ifdef _WIN32 +#include <windows.h> +#elif defined(__APPLE__) +#include <mach/mach_time.h> +#else +#include <time.h> +#endif + +#ifdef EIGEN_USE_THREADS +#include "unsupported/Eigen/CXX11/ThreadPool" +#endif + + +#include "Eigen/src/Core/util/DisableStupidWarnings.h" + +#include "unsupported/Eigen/SpecialFunctions" +#include "unsupported/Eigen/CXX11/src/util/CXX11Meta.h" +#include "unsupported/Eigen/CXX11/src/util/MaxSizeVector.h" + + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h" + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h" + +#undef TENSOR_CONTRACTION_DISPATCH +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous && \ + this->m_rhs_inner_dim_contiguous && \ + !this->m_rhs_inner_dim_reordered) { \ + METHOD<true, true, false, ALIGNMENT> ARGS; \ + } else { \ + eigen_assert(false && "Unsupported contraction formats"); \ + } + + +#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h" 
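+// The TENSOR_CONTRACTION_DISPATCH override above keeps only the
+// (lhs contiguous, rhs contiguous, not reordered) contraction instantiation
+// and asserts on every other combination, which is presumably how this
+// "reduced instantiations" header keeps the generated code small.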
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorScan.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" +#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h" +#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h" + +#include "Eigen/src/Core/util/ReenableStupidWarnings.h" + + +#endif // __NNFW_CKER_EGIEN_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H__ diff --git a/compute/cker/include/cker/gemmlowp/GEMMSupport.h b/compute/cker/include/cker/gemmlowp/GEMMSupport.h new file mode 100644 index 000000000..76486eded --- /dev/null +++ b/compute/cker/include/cker/gemmlowp/GEMMSupport.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_GEMMLOWP_GEMM_SUPPORT_H__ +#define __NNFW_CKER_GEMMLOWP_GEMM_SUPPORT_H__ + +#include <public/gemmlowp.h> + +#include <memory> +#include <thread> + +namespace nnfw +{ +namespace cker +{ +namespace gemm_support +{ + +struct GemmContext +{ + std::unique_ptr<gemmlowp::GemmContext> gemm_context; + constexpr static int default_num_threadpool_threads = 4; + + GemmContext() + { + int num_threads = std::thread::hardware_concurrency() / 2; + if (num_threads == 0) + { + num_threads = default_num_threadpool_threads; + } + + gemm_context.reset(new gemmlowp::GemmContext()); + gemm_context->set_max_num_threads(num_threads); + } + + static inline GemmContext &GetGemmLowpContext() + { + static GemmContext instance; + return instance; + } +}; + +inline gemmlowp::GemmContext *GetGemmLowpContext() +{ + auto &ctx = GemmContext::GetGemmLowpContext(); + return ctx.gemm_context.get(); +} + +} // namespace gemm_support +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_GEMMLOWP_GEMM_SUPPORT_H__ diff --git a/compute/cker/include/cker/neon/neon_check.h b/compute/cker/include/cker/neon/neon_check.h new file mode 100644 index 000000000..116f01bb7 --- /dev/null +++ b/compute/cker/include/cker/neon/neon_check.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_NEON_CHECK_H__ +#define __NNFW_CKER_NEON_CHECK_H__ + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#include <arm_neon.h> +#endif + +// Disable X86_NEON +// #if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON +#if 0 +#define USE_NEON +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wattributes" +#pragma GCC diagnostic ignored "-Wnarrowing" +#pragma GCC diagnostic ignored "-Wsequence-point" +#include "NEON_2_SSE.h" +#pragma GCC diagnostic pop +#endif + +// NEON_OR_PORTABLE(SomeFunc, args) calls NeonSomeFunc(args) if USE_NEON is +// defined, PortableSomeFunc(args) otherwise. +#ifdef USE_NEON +// Always use Neon code +#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__) + +#else +// No NEON available: Use Portable code +#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__) + +#endif // defined(USE_NEON) + +#endif // __NNFW_CKER_NEON_CHECK_H__ diff --git a/compute/cker/include/cker/operation/AddN.h b/compute/cker/include/cker/operation/AddN.h new file mode 100644 index 000000000..1704da641 --- /dev/null +++ b/compute/cker/include/cker/operation/AddN.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ADDN_H__ +#define __NNFW_CKER_ADDN_H__ + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +void AddN(const Shape &input_shape, const size_t num_inputs, const T **input_data, T *output_data) +{ + const size_t size = input_shape.FlatSize(); + for (size_t i = 0; i < size; ++i) + { + T x = 0; + for (size_t j = 0; j < num_inputs; ++j) + { + x += input_data[j][i]; + } + output_data[i] = x; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ADDN_H__ diff --git a/compute/cker/include/cker/operation/ArgMinMax.h b/compute/cker/include/cker/operation/ArgMinMax.h new file mode 100644 index 000000000..f7a06d74b --- /dev/null +++ b/compute/cker/include/cker/operation/ArgMinMax.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_ARGMINMAX_H__ +#define __NNFW_CKER_ARGMINMAX_H__ + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T1, typename T2, typename Cmp> +void ArgMinMax(const Shape &input1_shape, const T1 *input1_data, const Shape &output_shape, + T2 *output_data, int32_t axis, const Cmp &cmp) +{ + UNUSED_RELEASE(output_shape); + assert(input1_shape.DimensionsCount() > 0); + assert(input1_shape.DimensionsCount() - 1 == output_shape.DimensionsCount()); + if (axis < 0) + { + axis += input1_shape.DimensionsCount(); + } + const int axis_size = input1_shape.Dims(axis); + + int outer_size = 1; + for (int i = 0; i < axis; ++i) + { + assert(input1_shape.Dims(i) == output_shape.Dims(i)); + outer_size *= input1_shape.Dims(i); + } + + int inner_size = 1; + const int dims_count = input1_shape.DimensionsCount(); + for (int i = axis + 1; i < dims_count; ++i) + { + assert(input1_shape.Dims(i) == output_shape.Dims(i - 1)); + inner_size *= input1_shape.Dims(i); + } + for (int outer = 0; outer < outer_size; ++outer) + { + for (int inner = 0; inner < inner_size; ++inner) + { + auto min_max_value = input1_data[outer * axis_size * inner_size + inner]; + T2 min_max_index = 0; + for (int i = 1; i < axis_size; ++i) + { + const auto &curr_value = input1_data[(outer * axis_size + i) * inner_size + inner]; + if (cmp(curr_value, min_max_value)) + { + min_max_value = curr_value; + min_max_index = static_cast<T2>(i); + } + } + output_data[outer * inner_size + inner] = min_max_index; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ARGMINMAX_H__ diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h new file mode 100644 index 000000000..6149cafa7 --- /dev/null +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_AVERAGE_POOL_H__
+#define __NNFW_CKER_AVERAGE_POOL_H__
+
+#include "cker/neon/neon_check.h"
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <Eigen/Core>
+#include <cstring>
+#include <stdexcept>
+#include <type_traits>
+
+namespace nnfw
+{
+namespace cker
+{
+
+// TODO Change to apply neon for this function if it is faster
+template <typename T>
+void AveragePool(const PoolParams &, const Shape &, const T *, const Shape &, T *)
+{
+ static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
+ "cker::AveragePool : This function supports only integer or floating point");
+ throw std::runtime_error("cker::AveragePool : Unsupported data type");
+}
+
+template <>
+void AveragePool<float>(const PoolParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ // TODO(benoitjacob) make this a proper reference impl without Eigen!
+ const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+ auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+ // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+ Eigen::VectorXf out_count(out_mat.cols());
+ out_count.setZero();
+ // Prefill the output to 0.
+ out_mat.setZero();
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int h = 0; h < input_height; ++h)
+ {
+ for (int w = 0; w < input_width; ++w)
+ {
+ // (h_start, h_end) * (w_start, w_end) is the range that the input
+ // vector projects to.
+ int hpad = h + params.padding_values.height;
+ int wpad = w + params.padding_values.width;
+ int h_start =
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ int h_end = std::min(hpad / stride_height + 1, output_height);
+ int w_start =
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ int w_end = std::min(wpad / stride_width + 1, output_width);
+ // compute elementwise sum
+ for (int ph = h_start; ph < h_end; ++ph)
+ {
+ for (int pw = w_start; pw < w_end; ++pw)
+ {
+ int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+ out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+ out_count(out_offset)++;
+ }
+ }
+ }
+ }
+ }
+ // Divide the output by the actual number of elements being averaged over
+ assert(out_count.minCoeff() > 0);
+ out_mat.array().rowwise() /= out_count.transpose().array();
+
+ const int flat_size = output_shape.FlatSize();
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+ params.float_activation_max);
+ }
+}
+
+inline void AveragePool16(const PoolParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ // Here, and in other pooling ops, in order to maintain locality of reference,
+ // to minimize some recalculations, and to load into NEON vector registers, we
+ // use an inner loop down the depth.
Since depths can be large and hence we + // would need arbitrarily large temporary storage, we divide the work up into + // depth tranches just within the batch loop. + static constexpr int kPoolingAccTrancheSize = 256; + + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + uint16_t acc[kPoolingAccTrancheSize]; + for (int batch = 0; batch < batches; ++batch) + { + // We proceed through the depth in tranches (see comment above). The + // depth_base is the depth at the beginning of the tranche. The + // tranche_depth is the depth dimension of the tranche. + for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize) + { + const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + memset(acc, 0, tranche_depth * sizeof(acc[0])); + const uint8_t *input_ptr = + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + for (int fy = filter_y_start; fy < filter_y_end; fy++) + { + const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); + for (int fx = filter_x_start; fx < filter_x_end; fx++) + { + const uint8_t *input_channel_ptr = input_row_ptr; + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + uint16x8_t acc_reg[2]; + for (int i = 0; i < 2; i++) + { + acc_reg[i] = vld1q_u16(acc + channel + 8 * i); + } + uint8x16_t input_reg = vld1q_u8(input_channel_ptr); + input_channel_ptr += 16; + acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg)); + acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg)); + for (int i = 0; i < 2; i++) + { + vst1q_u16(acc + channel + 8 * i, acc_reg[i]); + } + } + for (; channel <= tranche_depth - 8; channel += 8) + { + uint16x8_t acc_reg = vld1q_u16(acc + channel); + uint8x8_t input_reg = vld1_u8(input_channel_ptr); + input_channel_ptr += 8; + acc_reg = vaddw_u8(acc_reg, input_reg); + vst1q_u16(acc + channel, acc_reg); + } +#endif + for (; channel < tranche_depth; ++channel) + { + acc[channel] += *input_channel_ptr++; + } + input_row_ptr += depth; + } + } + uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base); + int channel = 0; +#ifdef USE_NEON +#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ + if (filter_count == FILTER_COUNT) \ + { \ + for (; channel <= tranche_depth - 8; channel 
+= 8) \ + { \ + uint16_t buf[8]; \ + for (int i = 0; i < 8; i++) \ + { \ + buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ + } \ + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ + buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \ + buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \ + vst1_u8(output_ptr + channel, buf8); \ + } \ + } + AVGPOOL_DIVIDING_BY(9) + AVGPOOL_DIVIDING_BY(15) +#undef AVGPOOL_DIVIDING_BY + for (; channel <= tranche_depth - 8; channel += 8) + { + uint16_t buf[8]; + for (int i = 0; i < 8; i++) + { + buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; + } + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); + buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); + buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); + vst1_u8(output_ptr + channel, buf8); + } +#endif + for (; channel < tranche_depth; ++channel) + { + uint8_t a = (acc[channel] + filter_count / 2) / filter_count; + a = std::max<uint16_t>(a, params.quantized_activation_min); + a = std::min<uint16_t>(a, params.quantized_activation_max); + output_ptr[channel] = static_cast<uint8_t>(a); + } + } + } + } + } +} + +inline void AveragePool32(const PoolParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &output_shape, + uint8_t *output_data) +{ + + // Here, and in other pooling ops, in order to maintain locality of reference, + // to minimize some recalculations, and to load into NEON vector registers, we + // use an inner loop down the depth. Since depths can be large and hence we + // would need arbitrarily large temporary storage, we divide the work up into + // depth tranches just within the batch loop. + static constexpr int kPoolingAccTrancheSize = 256; + + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + uint32_t acc[kPoolingAccTrancheSize]; + for (int batch = 0; batch < batches; ++batch) + { + // We proceed through the depth in tranches (see comment above). The + // depth_base is the depth at the beginning of the tranche. The + // tranche_depth is the depth dimension of the tranche. 
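+ // Illustrative example (not part of the original source): with depth = 600
+ // and kPoolingAccTrancheSize = 256, the tranches are [0, 256), [256, 512)
+ // and [512, 600), so tranche_depth is 256, 256 and 88 respectively.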
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize) + { + const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + memset(acc, 0, tranche_depth * sizeof(acc[0])); + const uint8_t *input_ptr = + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + for (int fy = filter_y_start; fy < filter_y_end; fy++) + { + const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); + for (int fx = filter_x_start; fx < filter_x_end; fx++) + { + const uint8_t *input_channel_ptr = input_row_ptr; + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + uint16x4_t acc_reg[4]; + uint8x16_t input_reg = vld1q_u8(input_channel_ptr); + input_channel_ptr += 16; + acc_reg[0] = vget_low_u16(vmovl_u8(vget_low_u8(input_reg))); + acc_reg[1] = vget_high_u16(vmovl_u8(vget_low_u8(input_reg))); + acc_reg[2] = vget_low_u16(vmovl_u8(vget_high_u8(input_reg))); + acc_reg[3] = vget_high_u16(vmovl_u8(vget_high_u8(input_reg))); + for (int i = 0; i < 4; i++) + { + vst1q_u32(acc + channel + 4 * i, + vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i])); + } + } + for (; channel <= tranche_depth - 8; channel += 8) + { + uint16x4_t acc_reg[2]; + uint16x8_t input_reg = vmovl_u8(vld1_u8(input_channel_ptr)); + input_channel_ptr += 8; + acc_reg[0] = vget_low_u16(input_reg); + acc_reg[1] = vget_high_u16(input_reg); + for (int i = 0; i < 2; i++) + { + vst1q_u32(acc + channel + 4 * i, + vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i])); + } + } +#endif + for (; channel < tranche_depth; ++channel) + { + acc[channel] += *input_channel_ptr++; + } + input_row_ptr += depth; + } + } + uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base); + int channel = 0; +#ifdef USE_NEON +#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ + if (filter_count == FILTER_COUNT) \ + { \ + for (; channel <= tranche_depth - 8; channel += 8) \ + { \ + uint16_t buf[8]; \ + for (int i = 0; i < 8; i++) \ + { \ + buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ + } \ + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ + buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \ + buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \ + vst1_u8(output_ptr + channel, buf8); \ + } \ + } + AVGPOOL_DIVIDING_BY(9) + AVGPOOL_DIVIDING_BY(15) +#undef AVGPOOL_DIVIDING_BY + for (; channel <= tranche_depth - 8; channel += 8) + { + uint16_t buf[8]; + for (int i = 0; i < 8; i++) + { + buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; + } + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); + buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); + buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); + vst1_u8(output_ptr + channel, buf8); + } +#endif + for (; channel 
< tranche_depth; ++channel) + { + uint16_t a = (acc[channel] + filter_count / 2) / filter_count; + a = std::max<uint16_t>(a, params.quantized_activation_min); + a = std::min<uint16_t>(a, params.quantized_activation_max); + output_ptr[channel] = static_cast<uint8_t>(a); + } + } + } + } + } +} + +template <> +void AveragePool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &output_shape, + uint8_t *output_data) +{ + if (params.filter_height * params.filter_width > 16 * 16) + { + AveragePool32(params, input_shape, input_data, output_shape, output_data); + } + else + { + AveragePool16(params, input_shape, input_data, output_shape, output_data); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_AVERAGE_POOL_H__ diff --git a/compute/cker/include/cker/operation/BatchMatMul.h b/compute/cker/include/cker/operation/BatchMatMul.h new file mode 100644 index 000000000..18070982a --- /dev/null +++ b/compute/cker/include/cker/operation/BatchMatMul.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_BATCH_MATMUL_H__ +#define __NNFW_CKER_BATCH_MATMUL_H__ + +#include "Transpose.h" + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" +#include "cker/operation/reference/BatchMatMul.h" + +#include <vector> + +namespace nnfw +{ +namespace cker +{ + +class BatchMatMul +{ +public: + BatchMatMul() + { + // DO NOTHING + } + + /** + * @brief Prepare temporary area for calculation + */ + void prepare(const Shape &lhs_shape, const Shape &rhs_shape, bool adj_x, bool adj_y) + { + if (adj_x) + { + int32_t rank = lhs_shape.DimensionsCount(); + _temp_lhs_shape.Resize(rank); + + for (int32_t i = 0; i < rank - 2; i++) + { + _temp_lhs_shape.SetDim(i, lhs_shape.Dims(i)); + } + _temp_lhs_shape.SetDim(rank - 2, lhs_shape.Dims(rank - 1)); + _temp_lhs_shape.SetDim(rank - 1, lhs_shape.Dims(rank - 2)); + + _temp_lhs.resize(_temp_lhs_shape.FlatSize()); + } + + if (!adj_y) + { + int32_t rank = rhs_shape.DimensionsCount(); + _temp_rhs_shape.Resize(rank); + + for (int32_t i = 0; i < rank - 2; i++) + { + _temp_rhs_shape.SetDim(i, rhs_shape.Dims(i)); + } + _temp_rhs_shape.SetDim(rank - 2, rhs_shape.Dims(rank - 1)); + _temp_rhs_shape.SetDim(rank - 1, rhs_shape.Dims(rank - 2)); + + _temp_rhs.resize(_temp_rhs_shape.FlatSize()); + } + } + + void operator()(const Shape &lhs_shape, const float *lhs_data, const Shape &rhs_shape, + const float *rhs_data, bool adj_x, bool adj_y, const Shape &output_shape, + float *output_data) + { + // Assume lhs and rhs is not constant + // TODO Handle constant input + + if (!adj_y) + { + transposeRowsCols(rhs_shape, rhs_data, _temp_rhs_shape, _temp_rhs.data()); + } + + if (adj_x) + { + transposeRowsCols(lhs_shape, lhs_data, _temp_lhs_shape, _temp_lhs.data()); + } + + Shape new_lhs_shape = adj_x ? 
lhs_shape : swapRowColDims(lhs_shape);
+ Shape new_rhs_shape = adj_y ? rhs_shape : swapRowColDims(rhs_shape);
+ const float *new_lhs_data = adj_x ? _temp_lhs.data() : lhs_data;
+ const float *new_rhs_data = adj_y ? rhs_data : _temp_rhs.data();
+
+ // Note we pass RHS args first, LHS args second
+ // Check that the contracted dimensions of lhs and rhs are equal
+ assert(Shape::ExtendedShape(5, new_rhs_shape).Dims(4) ==
+ Shape::ExtendedShape(5, new_lhs_shape).Dims(3));
+ reference::BatchMatMul(new_rhs_shape, new_rhs_data, new_lhs_shape, new_lhs_data, output_shape,
+ output_data);
+ }
+
+private:
+ Shape swapRowColDims(const Shape &shape)
+ {
+ Shape swapped_shape(shape);
+ const uint32_t dims = shape.DimensionsCount();
+ swapped_shape.SetDim(dims - 2, shape.Dims(dims - 1));
+ swapped_shape.SetDim(dims - 1, shape.Dims(dims - 2));
+
+ return swapped_shape;
+ }
+
+ void transposeRowsCols(const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+ {
+ TransposeParams params;
+ int rank = input_shape.DimensionsCount();
+ params.perm_count = rank;
+ for (int i = 0; i < 2; i++)
+ {
+ params.perm[i] = i;
+ }
+ params.perm[rank - 2] = rank - 1;
+ params.perm[rank - 1] = rank - 2;
+
+ Transpose<float>(params, input_shape, input_data, output_shape, output_data);
+ }
+
+private:
+ std::vector<float> _temp_lhs;
+ Shape _temp_lhs_shape;
+ std::vector<float> _temp_rhs;
+ Shape _temp_rhs_shape;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BATCH_MATMUL_H__
diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h
new file mode 100644
index 000000000..e33b2fba5
--- /dev/null
+++ b/compute/cker/include/cker/operation/BatchToSpaceND.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BATCH_TO_SPACE_ND_H__
+#define __NNFW_CKER_BATCH_TO_SPACE_ND_H__
+
+#include "cker/Shape.h"
+
+#include <algorithm>
+#include <cstring>
+
+#define UNUSED(x) ((void)(x))
+
+namespace nnfw
+{
+namespace cker
+{
+
+// Helper methods for BatchToSpaceND.
+// `spatial_index_dim` specifies post-crop offset index in this spatial
+// dimension, i.e. spatial offset introduced by flattening batch to spatial
+// dimension minus the crop size at beginning. `block_shape_dim` is the block
+// size in current dimension. `input_dim` and `output_dim` are input and output
+// size of BatchToSpaceND operation in current dimension.
+// Output start index is inclusive and end index is exclusive.
+inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_dim, int output_dim,
+ int *start_index, int *end_index)
+{
+ // (*start_index) * block_shape_dim is effectively rounded up to the next
+ // multiple of block_shape_dim by the integer division.
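+ // Worked example (illustrative, not from the original source): with
+ // spatial_index_dim = -2, block_shape_dim = 2, input_dim = 4 and
+ // output_dim = 8, this yields *start_index = max(0, 3 / 2) = 1 and
+ // *end_index = min(4, 11 / 2) = 4, i.e. input indices [1, 4) are used.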
+ *start_index = std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim); + // Similarly, (*end_index) * block_shape_dim is rounded up too (note that + // end_index is exclusive). + *end_index = + std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); +} + +template <typename T> +inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1_data, + const int32_t *block_shape_data, const int32_t *crops_data, + const Shape &unextended_output_shape, T *output_data) +{ + auto input_dim = unextended_input1_shape.DimensionsCount(); + auto output_dim = unextended_output_shape.DimensionsCount(); + + assert(input_dim == 3 || input_dim == 4); + assert(input_dim == output_dim); + + UNUSED(input_dim); + UNUSED(output_dim); + + // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C. + auto extend_shape = [](const Shape &shape) { + if (shape.DimensionsCount() == 4) + { + return shape; + } + Shape new_shape(4, 1); + new_shape.SetDim(0, shape.Dims(0)); + new_shape.SetDim(1, shape.Dims(1)); + new_shape.SetDim(3, shape.Dims(2)); + return new_shape; + }; + const Shape input1_shape = extend_shape(unextended_input1_shape); + const Shape output_shape = extend_shape(unextended_output_shape); + + const int32_t output_width = output_shape.Dims(2); + const int32_t output_height = output_shape.Dims(1); + const int32_t output_batch_size = output_shape.Dims(0); + + const int32_t depth = input1_shape.Dims(3); + const int32_t input_width = input1_shape.Dims(2); + const int32_t input_height = input1_shape.Dims(1); + const int32_t input_batch_size = input1_shape.Dims(0); + + const int32_t block_shape_height = block_shape_data[0]; + const int32_t block_shape_width = block_shape_data[1]; + + const int32_t crops_top = crops_data[0]; + const int32_t crops_left = crops_data[2]; + + for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) + { + const int out_batch = in_batch % output_batch_size; + const int spatial_offset = in_batch / output_batch_size; + + int in_h_start = 0; + int in_h_end = 0; + // GetIndexRange ensures start and end indices are in [0, output_height). + GetIndexRange(spatial_offset / block_shape_width - crops_top, block_shape_height, input_height, + output_height, &in_h_start, &in_h_end); + + for (int in_h = in_h_start; in_h < in_h_end; ++in_h) + { + const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top; + assert(out_h >= 0); + assert(out_h < output_height); + + int in_w_start = 0; + int in_w_end = 0; + // GetIndexRange ensures start and end indices are in [0, output_width). 
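+ // For example (illustrative values): with block_shape_width = 2,
+ // crops_left = 0 and spatial_offset = 1, each in_w in [in_w_start, in_w_end)
+ // maps to out_w = 2 * in_w + 1 below.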
+ GetIndexRange(spatial_offset % block_shape_width - crops_left, block_shape_width, input_width, + output_width, &in_w_start, &in_w_end); + + for (int in_w = in_w_start; in_w < in_w_end; ++in_w) + { + const int out_w = + in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; + assert(out_w >= 0); + assert(out_w < output_width); + T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); + const T *in = input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0); + memcpy(out, in, depth * sizeof(T)); + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_BATCH_TO_SPACE_ND_H__ diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h new file mode 100644 index 000000000..d9917a9da --- /dev/null +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ +#define __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ + +#include <functional> +#include "cker/operation/optimized/BinaryArithmeticOps.h" +#include "cker/operation/reference/BinaryArithmeticOps.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +namespace +{ +template <BinaryArithmeticOpType op_type, typename T> +const std::function<T(const T &, const T &)> GetBinaryArtithmeticFn() +{ + switch (op_type) + { + case BinaryArithmeticOpType::ADD: + { + return [](const T &a, const T &b) -> T { return a + b; }; + } + case BinaryArithmeticOpType::MUL: + { + return [](const T &a, const T &b) -> T { return a * b; }; + } + case BinaryArithmeticOpType::SUB: + { + return [](const T &a, const T &b) -> T { return a - b; }; + } + case BinaryArithmeticOpType::DIV: + { + if (std::is_floating_point<T>::value) + return [](const T &a, const T &b) -> T { return a / b; }; + else + return [](const T &a, const T &b) -> T { + if (b == 0) + throw std::runtime_error("Divide by zero"); + return a / b; + }; + } + case BinaryArithmeticOpType::POW: + { + return [](const T &a, const T &b) -> T { return std::pow(a, b); }; + } + default: + { + assert(false); + return nullptr; + } + } +} +} // namespace + +// Consolidates dimensions in broadcast inputs, checks for five-fold pattern. +// +// For example, if sequence of dimensions of one input is +// ..., 1, 3, 1, 7, 9, 5,... and the other is ..., 2, 3, 1, 7, 1, 1, ... +// we can consolidate these as +// ..., 1, 3*7, 9*5, ... and 2, 3*7, 1. +// +// The category is updated in the less-frequent case of shapes that are +// not suited to a fivefold-loop broadcast. +// +// Falls back to generic pattern when it does not know how to process properly. 
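+//
+// Illustrative call pattern (assumed, not from the original source):
+//
+//   BinaryArithmeticOpParam params;
+//   if (ProcessBroadcastShapes(lhs_shape, rhs_shape, &params))
+//     BroadcastBinaryArithmeticOp<BinaryArithmeticOpType::ADD>(
+//         params, lhs_shape, lhs_data, rhs_shape, rhs_data, out_shape, out_data);
+//   else
+//     BinaryArithmeticOp<BinaryArithmeticOpType::ADD>(
+//         params, lhs_shape, lhs_data, rhs_shape, rhs_data, out_shape, out_data);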
+// +// Returns true iff there is some sort of broadcast, which includes five-fold +// patterns and falling back to generic broadcast. +inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, + BinaryArithmeticOpParam *params) +{ + const int dims_count = std::max(shape0.DimensionsCount(), shape1.DimensionsCount()); + + params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast; + Shape scalar_shape(dims_count, 1); + + auto extended_shape0 = Shape::ExtendedShape(dims_count, shape0); + auto extended_shape1 = Shape::ExtendedShape(dims_count, shape1); + + // Check for "exact" match, implicitly accepting any scalar shapes. + if (extended_shape0 == extended_shape1) + { + params->broadcast_category = BroadcastableOpCategory::kNonBroadcast; + return false; + } + + for (int i = dims_count - 1; i >= 0; --i) + { + if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) + { + continue; + } + else if (extended_shape0.Dims(i) == 1) + { + params->broadcast_category = BroadcastableOpCategory::kFirstInputBroadcastsFast; + break; + } + else if (extended_shape1.Dims(i) == 1) + { + params->broadcast_category = BroadcastableOpCategory::kSecondInputBroadcastsFast; + break; + } + else + { + // This case is erroneous: there is a dimension that does not match and + // is not a broadcast from one shape to the other. + params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast; + return true; + } + } + + if (params->broadcast_category != BroadcastableOpCategory::kFirstInputBroadcastsFast && + params->broadcast_category != BroadcastableOpCategory::kSecondInputBroadcastsFast) + { + return false; + } + + // From this point it is assumed contractually that corresponding dimensions + // in shape0 and shape1 are either (a) equal or (b) one or other equals 1. + const bool swap_inputs = + params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; + const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0; + const Shape *shape_b = swap_inputs ? &extended_shape0 : &extended_shape1; + + int i = dims_count - 1; + params->broadcast_shape[0] = 1; + params->broadcast_shape[1] = 1; + params->broadcast_shape[2] = 1; + params->broadcast_shape[3] = 1; + params->broadcast_shape[4] = 1; + // y_0 is greedy: include dims if both or neither equal 1: in other words, + // test for equality rather than (shape_a->Dims(i) != 1). + while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) + { + params->broadcast_shape[4] *= shape_b->Dims(i); + --i; + } + // Here either input_a or input_b has dim of 1 (if i >= 0). If it is input_b + // that has the unit dimension, the next two loops are not entered. + while (i >= 0 && shape_a->Dims(i) == 1) + { + params->broadcast_shape[3] *= shape_b->Dims(i); + --i; + } + while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) + { + params->broadcast_shape[2] *= shape_a->Dims(i); + --i; + } + // Here either input_a or input_b has dim of 1 (if i >= 0). + while (i >= 0 && shape_b->Dims(i) == 1) + { + params->broadcast_shape[1] *= shape_a->Dims(i); + --i; + } + while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) + { + params->broadcast_shape[0] *= shape_b->Dims(i); + --i; + } + + // Rarer case is when the broadcast dimensions cannot be handled by a fivefold + // loop. 
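+ // Worked example (illustrative, not from the original source): shape0 = [2, 3, 5]
+ // and shape1 = [2, 3, 1] select kSecondInputBroadcastsFast and leave
+ // broadcast_shape = {1, 1, 6, 5, 1} with i < 0, so the fivefold loop applies
+ // and the generic fallback below is not taken.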
+ if (i >= 0) + { + params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast; + } + return true; +} + +template <BinaryArithmeticOpType op_type, typename T> +inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, T *output_data) +{ + reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); +} + +template <BinaryArithmeticOpType op_type> +inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const uint8_t *input1_data, const Shape &input2_shape, + const uint8_t *input2_data, const Shape &output_shape, + uint8_t *output_data) +{ + switch (op_type) + { + case nnfw::cker::BinaryArithmeticOpType::ADD: + case nnfw::cker::BinaryArithmeticOpType::SUB: + optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::MUL: + optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, + const_cast<uint8_t *>(input2_data), output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::DIV: + throw std::runtime_error{"Quant8 Asymm NYI"}; + + default: + assert(false); + break; + } +} + +template <BinaryArithmeticOpType op_type> +inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data) +{ + // Supported type is only float now + switch (op_type) + { + case nnfw::cker::BinaryArithmeticOpType::ADD: + optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::MUL: + optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::SUB: + optimized::Sub(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::DIV: + optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); + break; + default: + assert(false); + break; + } +} + +template <BinaryArithmeticOpType op_type, typename T> +inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, + T *output_data) +{ + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + GetBinaryArtithmeticFn<op_type, T>()); +} + +template <BinaryArithmeticOpType op_type> +inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const uint8_t *input1_data, const Shape &input2_shape, + const uint8_t *input2_data, const Shape &output_shape, + uint8_t *output_data) +{ + switch (op_type) + { + case nnfw::cker::BinaryArithmeticOpType::ADD: + case nnfw::cker::BinaryArithmeticOpType::SUB: + optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::MUL: + 
optimized::BroadcastMulDispatchQuant8( + params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, + const_cast<uint8_t *>(input2_data), output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::DIV: + case nnfw::cker::BinaryArithmeticOpType::POW: + throw std::runtime_error{"Quant8 Asymm NYI"}; + default: + assert(false); + break; + } +} + +template <BinaryArithmeticOpType op_type> +inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data) +{ + // Supported type is only float now + switch (op_type) + { + case nnfw::cker::BinaryArithmeticOpType::ADD: + optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::MUL: + optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::SUB: + optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::DIV: + optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; + case nnfw::cker::BinaryArithmeticOpType::POW: + reference::BroadcastBinaryArithmeticOpSlow<float>( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + GetBinaryArtithmeticFn<op_type, float>()); + break; + default: + assert(false); + break; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_BINARY_ARITHMETIC_OPS_H__ diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h new file mode 100644 index 000000000..5068eca96 --- /dev/null +++ b/compute/cker/include/cker/operation/BroadcastTo.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_BROADCAST_TO_H__ +#define __NNFW_CKER_BROADCAST_TO_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/eigen/EigenSupport.h" + +#include "cker/operation/Helper/Tensor.h" +#include "cker/operation/Helper/BCast.h" + +#include <vector> + +#define UNUSED(x) (void)(x) + +namespace nnfw +{ +namespace cker +{ +namespace functor +{ +static const int32_t kint32max = ((int32_t)0x7FFFFFFF); + +template <typename Device, typename T> struct FillFunctor +{ + // Computes on device "d": out = out.constant(in(0)), + void operator()(const Device &d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstScalar in); +}; + +template <typename T> struct FillFunctor<Eigen::ThreadPoolDevice, T> +{ + void operator()(const Eigen::ThreadPoolDevice &d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstScalar in) + { + out.device(d) = out.constant(in()); + } +}; + +template <typename Device, typename T> struct BroadcastTo +{ + template <int NDIMS> + void DoBCast32Bit(const Device &device, typename TTypes<T, NDIMS>::Tensor out, + typename TTypes<T, NDIMS>::ConstTensor in, + const typename Eigen::array<int, NDIMS> &bcast) const + { + To32Bit(out).device(device) = To32Bit(in).broadcast(bcast); + } + + template <int NDIMS> + void DoBCast(const Device &device, typename TTypes<T, NDIMS>::Tensor out, + typename TTypes<T, NDIMS>::ConstTensor in, + const typename Eigen::array<Eigen::DenseIndex, NDIMS> &bcast) const + { + out.device(device) = in.broadcast(bcast); + } + + template <int NDIMS> + void ReshapeAndBCast(const Device &device, Tensor &output_tensor, const Tensor &input_tensor, + const BCast &bcast) const + { + const bool can_use_32bit = std::is_same<Eigen::GpuDevice, Device>::value && + output_tensor.shape.FlatSize() < kint32max && + input_tensor.shape.FlatSize() < kint32max; + if (can_use_32bit) + { + DoBCast32Bit<NDIMS>(device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()), + input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()), + BCast::ToIndexArrayType<int, NDIMS>(bcast.x_bcast())); + } + else + { + DoBCast<NDIMS>(device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()), + input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()), + BCast::ToIndexArrayType<Eigen::DenseIndex, NDIMS>(bcast.x_bcast())); + } + } + + // PRECONDITION: rank(input_shape) > 0 && + // rank(input_shape) <= rank(output_shape) && + // output_shape.num_elements() > 0. + void operator()(const Device &device, Tensor &output_tensor, const Shape &output_shape, + const Tensor &input_tensor, const Shape &input_shape, const BCast &bcast) const + { + const int ndims = bcast.y_reshape().size(); + switch (ndims) + { + case 1: + ReshapeAndBCast<1>(device, output_tensor, input_tensor, bcast); + break; + case 2: + ReshapeAndBCast<2>(device, output_tensor, input_tensor, bcast); + break; + case 3: + ReshapeAndBCast<3>(device, output_tensor, input_tensor, bcast); + break; + case 4: + ReshapeAndBCast<4>(device, output_tensor, input_tensor, bcast); + break; + case 5: + ReshapeAndBCast<5>(device, output_tensor, input_tensor, bcast); + break; + default: + // NOTE : UNUSED leaves for maintenance purposes. 
+ UNUSED(output_shape);
+ UNUSED(input_shape);
+ break;
+ }
+ }
+};
+} // namespace functor
+
+template <typename T>
+inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape,
+ T *output_data)
+{
+ const int input_flatsize = input_shape.FlatSize();
+
+ if (input_shape == output_shape)
+ {
+ memcpy(output_data, input_data, input_flatsize * sizeof(T));
+ return;
+ }
+
+ // Input shape's rank must be no greater than rank of output shape.
+ assert(input_shape.DimensionsCount() <= output_shape.DimensionsCount());
+
+ // Output rank shouldn't be 0.
+ assert(output_shape.DimensionsCount());
+
+ Tensor output_tensor;
+ Tensor input_tensor;
+
+ input_tensor.shape.ReplaceWith(input_shape.DimensionsCount(), input_shape.DimsData());
+ input_tensor.buffer = input_data;
+
+ output_tensor.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
+ output_tensor.buffer = output_data;
+
+ const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
+
+ // Handle broadcast from Scalar.
+ if (input_flatsize == 0)
+ {
+ functor::FillFunctor<Eigen::ThreadPoolDevice, T>()(device, output_tensor.flat<T>(),
+ input_tensor.scalar<T>());
+ }
+
+ BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
+ /*fewer_dims_optimization=*/true);
+
+ // The broadcast is expected to be valid here.
+ assert(bcast.IsValid());
+ // The broadcast output shape should match the requested output shape.
+ assert(BCast::ToShape(bcast.output_shape()) == output_shape);
+
+ functor::BroadcastTo<Eigen::ThreadPoolDevice, T>()(device, output_tensor, output_shape,
+ input_tensor, input_shape, bcast);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BROADCAST_TO_H__
diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h
new file mode 100644
index 000000000..d69b38aca
--- /dev/null
+++ b/compute/cker/include/cker/operation/Common.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_COMMON_H__
+#define __NNFW_CKER_COMMON_H__
+
+#include "cker/neon/neon_check.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const float *bias_data,
+ int array_size, float *array_data)
+{
+ // Note: see b/132215220: in May 2019 we thought it would be OK to replace
+ // this with the Eigen one-liner:
+ // return (array.colwise() + bias).cwiseMin(clamp_max).cwiseMax(clamp_min).
+ // This turned out to severely regress performance: +4ms (i.e. 8%) on
+ // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
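+ // Semantics (illustrative example, not from the original source): for each
+ // bias_size-long chunk of array_data,
+ //   array[i] = clamp(array[i] + bias[i % bias_size], clamp_min, clamp_max).
+ // E.g. bias_size = 2, array = {1, 2, 3, 4}, bias = {10, 20} and a [0, 22]
+ // clamp produce {11, 22, 13, 22}.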
+ assert((array_size % bias_size) == 0); +#ifdef USE_NEON + float *array_ptr = array_data; + float *array_end_ptr = array_ptr + array_size; + const auto clamp_min_vec = vdupq_n_f32(clamp_min); + const auto clamp_max_vec = vdupq_n_f32(clamp_max); + for (; array_ptr != array_end_ptr; array_ptr += bias_size) + { + int i = 0; + for (; i <= bias_size - 16; i += 16) + { + auto b0 = vld1q_f32(bias_data + i); + auto b1 = vld1q_f32(bias_data + i + 4); + auto b2 = vld1q_f32(bias_data + i + 8); + auto b3 = vld1q_f32(bias_data + i + 12); + auto a0 = vld1q_f32(array_ptr + i); + auto a1 = vld1q_f32(array_ptr + i + 4); + auto a2 = vld1q_f32(array_ptr + i + 8); + auto a3 = vld1q_f32(array_ptr + i + 12); + auto x0 = vaddq_f32(a0, b0); + auto x1 = vaddq_f32(a1, b1); + auto x2 = vaddq_f32(a2, b2); + auto x3 = vaddq_f32(a3, b3); + x0 = vmaxq_f32(clamp_min_vec, x0); + x1 = vmaxq_f32(clamp_min_vec, x1); + x2 = vmaxq_f32(clamp_min_vec, x2); + x3 = vmaxq_f32(clamp_min_vec, x3); + x0 = vminq_f32(clamp_max_vec, x0); + x1 = vminq_f32(clamp_max_vec, x1); + x2 = vminq_f32(clamp_max_vec, x2); + x3 = vminq_f32(clamp_max_vec, x3); + vst1q_f32(array_ptr + i, x0); + vst1q_f32(array_ptr + i + 4, x1); + vst1q_f32(array_ptr + i + 8, x2); + vst1q_f32(array_ptr + i + 12, x3); + } + for (; i <= bias_size - 4; i += 4) + { + auto b = vld1q_f32(bias_data + i); + auto a = vld1q_f32(array_ptr + i); + auto x = vaddq_f32(a, b); + x = vmaxq_f32(clamp_min_vec, x); + x = vminq_f32(clamp_max_vec, x); + vst1q_f32(array_ptr + i, x); + } + for (; i < bias_size; i++) + { + array_ptr[i] = + ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); + } + } +#else // not NEON + for (int array_offset = 0; array_offset < array_size; array_offset += bias_size) + { + for (int i = 0; i < bias_size; i++) + { + array_data[array_offset + i] = ActivationFunctionWithMinMax( + array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); + } + } +#endif +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_COMMON_H__ diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h new file mode 100644 index 000000000..47eb6034c --- /dev/null +++ b/compute/cker/include/cker/operation/Comparison.h @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_COMPARISON_H__
+#define __NNFW_CKER_COMPARISON_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T> inline bool EqualFn(T lhs, T rhs) { return lhs == rhs; }
+template <typename T> inline bool NotEqualFn(T lhs, T rhs) { return lhs != rhs; }
+template <typename T> inline bool GreaterFn(T lhs, T rhs) { return lhs > rhs; }
+template <typename T> inline bool GreaterEqualFn(T lhs, T rhs) { return lhs >= rhs; }
+template <typename T> inline bool LessFn(T lhs, T rhs) { return lhs < rhs; }
+template <typename T> inline bool LessEqualFn(T lhs, T rhs) { return lhs <= rhs; }
+
+template <typename T> using ComparisonFn = bool (*)(T, T);
+
+template <typename T, ComparisonFn<T> F>
+inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, bool *output_data)
+{
+ // flatsize is the number of elements to compare.
+ const int64_t flatsize = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ output_data[i] = F(input1_data[i], input2_data[i]);
+ }
+}
+
+template <ComparisonFn<float> F>
+inline void Comparison(const Shape &input1_shape, const float *input1_data,
+ const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, bool *output_data)
+{
+ ComparisonImpl<float, F>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
+}
+
+template <typename T, ComparisonFn<int32_t> F>
+inline void ComparisonWithScaling(ComparisonParams &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape,
+ bool *output_data)
+{
+ int left_shift = params.left_shift;
+ int32_t input1_offset = params.input1_offset;
+ int32_t input1_multiplier = params.input1_multiplier;
+ int input1_shift = params.input1_shift;
+ int32_t input2_offset = params.input2_offset;
+ int32_t input2_multiplier = params.input2_multiplier;
+ int input2_shift = params.input2_shift;
+ const int64_t flatsize = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ const int32_t input1_val = input1_offset + input1_data[i];
+ const int32_t input2_val = input2_offset + input2_data[i];
+ const int32_t shifted_input1_val = input1_val * (1 << left_shift);
+ const int32_t shifted_input2_val = input2_val * (1 << left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, input1_multiplier, input1_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, input2_multiplier, input2_shift);
+ output_data[i] = F(scaled_input1_val, scaled_input2_val);
+ }
+}
+
+template <typename T, ComparisonFn<T> F>
+inline void
+BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, bool *output_data)
+{
+ assert(unextended_input1_shape.DimensionsCount() <= 4);
+ assert(unextended_input2_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b =
0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + output_data[Offset(output_shape, b, y, x, c)] = + F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]); + } + } + } + } +} + +template <typename T, ComparisonFn<T> F> +inline void BroadcastComparison4DSlow(const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, bool *output_data) +{ + BroadcastComparison4DSlowImpl<T, F>(input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); +} + +template <typename T, ComparisonFn<int32_t> F> +inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, + const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, bool *output_data) +{ + assert(input1_shape.DimensionsCount() <= 4); + assert(input2_shape.DimensionsCount() <= 4); + assert(output_shape.DimensionsCount() <= 4); + + int left_shift = params.left_shift; + int32_t input1_offset = params.input1_offset; + int32_t input1_multiplier = params.input1_multiplier; + int input1_shift = params.input1_shift; + int32_t input2_offset = params.input2_offset; + int32_t input2_multiplier = params.input2_multiplier; + int input2_shift = params.input2_shift; + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + const int32_t input1_val = + input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; + const int32_t input2_val = + input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + const int32_t shifted_input1_val = input1_val * (1 << left_shift); + const int32_t shifted_input2_val = input2_val * (1 << left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, input1_multiplier, input1_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, input2_multiplier, input2_shift); + output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val); + } + } + } + } +} + +#define TFLITE_COMPARISON_OP(name) \ + template <typename T> \ + inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ + const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ + output_data); \ + } \ + template <typename T> \ + inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void name##WithScaling(ComparisonParams ¶ms, const Shape &input1_shape, \ + const T *input1_data, const Shape &input2_shape, \ + const T *input2_data, const Shape &output_shape, \ + bool *output_data) \ + { \ + ComparisonWithScaling<T, name##Fn>(params, input1_shape, 
input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##WithScaling(ComparisonParams ¶ms, \ + const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ + } + +TFLITE_COMPARISON_OP(Equal); +TFLITE_COMPARISON_OP(NotEqual); +TFLITE_COMPARISON_OP(Greater); +TFLITE_COMPARISON_OP(GreaterEqual); +TFLITE_COMPARISON_OP(Less); +TFLITE_COMPARISON_OP(LessEqual); +#undef TFLITE_COMPARISON_OP + +} // namespace cker +} // namespace nnfw + +#endif diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h new file mode 100644 index 000000000..394123e30 --- /dev/null +++ b/compute/cker/include/cker/operation/Concatenation.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_CONCATENATION_H__
+#define __NNFW_CKER_CONCATENATION_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <algorithm> // std::min, std::max
+#include <cassert>   // assert
+#include <cstdint>
+#include <cstring>   // memcpy
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+inline void Concatenation(const ConcatenationParams &params, const Shape *const *input_shapes,
+                          const Scalar *const *input_data, const Shape &output_shape,
+                          Scalar *output_data)
+{
+  int axis = params.axis;
+  int inputs_count = params.inputs_count;
+  const int concat_dimensions = output_shape.DimensionsCount();
+  assert(axis < concat_dimensions);
+
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++)
+  {
+    assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
+    for (int j = 0; j < concat_dimensions; j++)
+    {
+      if (j != axis)
+      {
+        auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j);
+        UNUSED_RELEASE(dim_checked);
+      }
+    }
+    concat_size += input_shapes[i]->Dims(axis);
+  }
+  assert(concat_size == output_shape.Dims(axis));
+  int64_t outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= output_shape.Dims(i);
+  }
+  // For all input arrays,
+  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+  int64_t base_inner_size = 1;
+  for (int i = axis + 1; i < concat_dimensions; ++i)
+  {
+    base_inner_size *= output_shape.Dims(i);
+  }
+
+  Scalar *output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++)
+  {
+    for (int i = 0; i < inputs_count; ++i)
+    {
+      const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+      memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+      output_ptr += copy_size;
+    }
+  }
+}
+
+// Note: the implementation below is not fully quantized, as it takes scale as
+// a floating point value. This should be fixed when optimizing this routine
+// further.
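+// (A sketch of the per-element requantization performed below, with assumed
+// example values: input_scale = 0.5, output_scale = 0.25 and zero points 0
+// give scale = 0.5 / 0.25 = 2.0 and bias = 0, so an input byte of 10 is
+// written out as round(10 * 2.0 + 0) + 0 = 20, clamped to [0, 255].)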
+inline void ConcatenationWithScaling(const ConcatenationParams &params,
+                                     const Shape *const *input_shapes,
+                                     const uint8_t *const *input_data, const Shape &output_shape,
+                                     uint8_t *output_data)
+{
+  int axis = params.axis;
+  const int32_t *input_zeropoint = params.input_zeropoint;
+  const float *input_scale = params.input_scale;
+  int inputs_count = params.inputs_count;
+  const int32_t output_zeropoint = params.output_zeropoint;
+  const float output_scale = params.output_scale;
+
+  const int concat_dimensions = output_shape.DimensionsCount();
+  assert(axis < concat_dimensions);
+
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++)
+  {
+    assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
+    for (int j = 0; j < concat_dimensions; j++)
+    {
+      if (j != axis)
+      {
+        assert(input_shapes[i]->Dims(j) == output_shape.Dims(j));
+      }
+    }
+    concat_size += input_shapes[i]->Dims(axis);
+  }
+  assert(concat_size == output_shape.Dims(axis));
+  int64_t outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= output_shape.Dims(i);
+  }
+  // For all input arrays,
+  // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+  int64_t base_inner_size = 1;
+  for (int i = axis + 1; i < concat_dimensions; ++i)
+  {
+    base_inner_size *= output_shape.Dims(i);
+  }
+
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8_t *output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++)
+  {
+    for (int i = 0; i < inputs_count; ++i)
+    {
+      const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
+      const uint8_t *input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
+      {
+        memcpy(output_ptr, input_ptr, copy_size);
+      }
+      else
+      {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j)
+        {
+          const int32_t value =
+            static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+          output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CONCATENATION_H__
diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h
new file mode 100644
index 000000000..b20bac3ac
--- /dev/null
+++ b/compute/cker/include/cker/operation/Conv.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CONV_H__
+#define __NNFW_CKER_CONV_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/operation/reference/Conv.h"
+#include "cker/operation/optimized/Conv.h"
+#include <iostream>
+#include <memory> // std::make_unique
+#include <thread> // std::thread::hardware_concurrency
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace
+{
+// Naive implementation of transpose for floats.
Could be optimized to be more +// cache friendly, but for now it's a one-time cost on first run, and we would +// prefer to remove the need to do this at all eventually. +inline void TransposeFloatTensor(const float *input_data, const nnfw::cker::Shape &output_shape, + float *output_data) +{ + const int rows = output_shape.Dims(1); + const int cols = output_shape.Dims(0); + for (int i = 0; i < rows; ++i) + { + for (int j = 0; j < cols; ++j) + { + const float in_value = input_data[i * cols + j]; + output_data[j * rows + i] = in_value; + } + } +} +} // namespace + +class Conv +{ +public: + Conv() : _modified_filter_data(), _im2col_shape(4), _need_im2col(false), _prepared(false) {} + + void prepare(const Shape &filter_shape, const float *filter_data, PaddingType padding_type, + bool &is_replaced_weights, uint32_t dilationWidthFactor, + uint32_t dilationHeightFactor) + { + if (!_prepared) + { + if (usableMultiThreaded(padding_type, dilationWidthFactor, dilationHeightFactor)) + { + transposeFilter(filter_shape, filter_data, is_replaced_weights); + } + _prepared = true; + } + } + + void prepareQuant(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape, + uint32_t stride_width, uint32_t stride_height, uint32_t dilation_width_factor, + uint32_t dilation_height_factor) + { + if (!_prepared) + { + IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height, + dilation_width_factor, dilation_height_factor); + _prepared = true; + } + } + + void operator()(const ConvParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, float *output_data) + { + if (usableMultiThreaded(params.padding_type, params.dilation_width_factor, + params.dilation_height_factor)) + { + bool transposed_in_execution = false; + if (!_prepared) + { + // This means that filter is not constant + // TODO Apply optimized kernel if multithreaded kernel is slower than optimized kernel by + // transposing filter data + transposeFilter(filter_shape, filter_data, transposed_in_execution); + } + multithreaded::Conv(params, input_shape, input_data, filter_shape, &_modified_filter_data[0], + bias_shape, bias_data, output_shape, output_data); + } + else + { + // TODO Support optimized kernel + reference::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data); + } + } + + void operator()(const ConvParams ¶ms, const Shape &input_shape, const uint8_t *input_data, + const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) + { + if (!_prepared) + { + // This means that input or output are dynamic or filter is not constant + IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width, + params.stride_height, params.dilation_width_factor, + params.dilation_height_factor); + } + + int im2col_size = _need_im2col ? 
_im2col_shape.FlatSize() : 1; + + // Use heap if size is larger than 8MB + if (im2col_size > 8 * 1024 * 1024) + { + std::unique_ptr<uint8_t[]> im2col_data = std::make_unique<uint8_t[]>(im2col_size); + optimized::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, _im2col_shape, im2col_data.get()); + } + else + { + uint8_t im2col_data[im2col_size]; + optimized::Conv(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, _im2col_shape, im2col_data); + } + } + +private: + bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor, + int32_t dilation_height_factor) + { + return padding_type != PaddingType::kNone && std::thread::hardware_concurrency() > 1 && + dilation_width_factor == 1 && dilation_height_factor == 1; + } + + void transposeFilter(const Shape &filter_shape, const float *filter_data, + bool &is_replaced_weights) + { + const auto output_depth = filter_shape.Dims(0); + const Shape hwcn_filter_shape{filter_shape.FlatSize() / output_depth, output_depth}; + _modified_filter_data.resize(hwcn_filter_shape.FlatSize()); + TransposeFloatTensor(filter_data, hwcn_filter_shape, &_modified_filter_data[0]); + is_replaced_weights = true; + } + + void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape, + const Shape &output_shape, uint32_t stride_width, uint32_t stride_height, + uint32_t dilation_width_factor, uint32_t dilation_height_factor) + { + const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; + const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 || + kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1; + + _need_im2col = need_dilated_im2col || need_non_dilated_im2col; + + if (_need_im2col) + { + _im2col_shape.SetDim(0, output_shape.Dims(0)); + _im2col_shape.SetDim(1, output_shape.Dims(1)); + _im2col_shape.SetDim(2, output_shape.Dims(2)); + _im2col_shape.SetDim(3, input_shape.Dims(3) * kernel_shape.Dims(1) * kernel_shape.Dims(2)); + } + } + +private: + std::vector<float> _modified_filter_data; + Shape _im2col_shape; + bool _need_im2col; + bool _prepared; +}; +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CONCATENATION_H_ diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h new file mode 100644 index 000000000..814a9e019 --- /dev/null +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__ +#define __NNFW_CKER_DEPTHWISE_CONV_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" +#include "cker/operation/optimized/DepthwiseConvUint8.h" + +namespace nnfw +{ +namespace cker +{ + +inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) +{ + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(output_activation_min <= output_activation_max); + UNUSED_RELEASE(output_activation_min); + UNUSED_RELEASE(output_activation_max); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + UNUSED_RELEASE(input_depth); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(depth_multiplier); + +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) +// TODO Use below codes + +// const int stride_width = params.stride_width; +// const int stride_height = params.stride_height; +// const int pad_width = params.padding_values.width; +// const int pad_height = params.padding_values.height; +// const int output_shift = params.output_shift; +// +// // Call kernel optimized for depthwise convolutions using 3x3 filters if +// // parameters are supported. 
+// if (Fast3x3FilterKernelSupported( +// input_shape, filter_shape, stride_width, stride_height, +// dilation_width_factor, dilation_height_factor, pad_width, pad_height, +// depth_multiplier, output_shape, output_shift)) { +// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, +// filter_data, bias_shape, bias_data, output_shape, +// output_data); +// return; +// } +#endif + + optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data); +} + +inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &filter_shape, + const float *filter_data, const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(bias_shape); + + for (int b = 0; b < batches; ++b) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int ic = 0; ic < input_depth; ++ic) + { + for (int m = 0; m < depth_multiplier; m++) + { + const int oc = m + ic * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + float total = 0.f; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. 
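+                  // (A sketch of the padding arithmetic above, with assumed
+                  // values: for stride 1, pad 1, dilation 1, out_x = 0 and
+                  // filter_x = 0 give in_x = 0 * 1 - 1 + 1 * 0 = -1, which
+                  // fails the check below and contributes zero to the total.)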
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) + { + float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)]; + float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)]; + total += (input_value * filter_value); + } + } + } + float bias_value = 0.0f; + if (bias_data) + { + bias_value = bias_data[oc]; + } + output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax( + total + bias_value, output_activation_min, output_activation_max); + } + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_DEPTHWISE_CONV_H__ diff --git a/compute/cker/include/cker/operation/Dequantize.h b/compute/cker/include/cker/operation/Dequantize.h new file mode 100644 index 000000000..c4875812b --- /dev/null +++ b/compute/cker/include/cker/operation/Dequantize.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_DEQUANTIZE_H__ +#define __NNFW_CKER_DEQUANTIZE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ + +#ifdef USE_NEON +namespace +{ +inline void ScaleWithNewZeroPoint(const int32x4_t input, const float32x4_t scale_dup, + const float32x4_t zero_times_scale_dup, float32x4_t *output) +{ +#ifdef __ARM_FEATURE_FMA + *output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup); +#else + *output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup), zero_times_scale_dup); +#endif +} +} // namespace +#endif // USE_NEON + +inline void Dequantize(const Shape &input_shape, const uint8_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const uint8x8_t input_u8 = vld1_u8(input_data + i); + const uint16x8_t input_u16 = vmovl_u8(input_u8); + const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16); + const int16x4_t input_s16_low = vget_low_s16(input_s16); + const int16x4_t input_s16_high = vget_high_s16(input_s16); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } 
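+  // (When USE_NEON is defined, the vector loop consumes 8 elements per
+  // iteration and the scalar loop above finishes the remaining tail.)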
+} + +inline void Dequantize(const Shape &input_shape, const int8_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const int8x8_t input_s8 = vld1_s8(input_data + i); + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x4_t input_s16_low = vget_low_s16(input_s16); + const int16x4_t input_s16_high = vget_high_s16(input_s16); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_DEQUANTIZE_H__ diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h new file mode 100644 index 000000000..3d1837f47 --- /dev/null +++ b/compute/cker/include/cker/operation/Einsum.h @@ -0,0 +1,934 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_EINSUM_H__ +#define __NNFW_CKER_EINSUM_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/operation/Helper/Tensor.h" +#include "cker/operation/Helper/MatmulBCast.h" + +#include "Transpose.h" +#include "BatchMatMul.h" + +#include <string> +#include <vector> +#include <map> +#include <numeric> +#include <algorithm> + +namespace nnfw +{ +namespace cker +{ + +namespace functor +{ + +template <typename Device, typename T, int N> struct StrideFunctor +{ + void operator()(const Device &d, typename TTypes<T, N>::ConstTensor input, + const std::vector<int32_t> &strides, typename TTypes<T, N>::Tensor output) + { + + Eigen::DSizes<Eigen::DenseIndex, N> dsizes; + for (size_t d = 0; d < strides.size(); d++) + { + dsizes[d] = static_cast<Eigen::DenseIndex>(strides[d]); + } + for (size_t d = strides.size(); d < N; d++) + { + dsizes[d] = 1; + } + + output.device(d) = input.stride(dsizes); + } +}; + +template <typename Device, typename T, int N> struct InflateFunctor +{ + void operator()(const Device &d, typename TTypes<T, N>::ConstTensor input, + const std::vector<int32_t> &strides, typename TTypes<T, N>::Tensor output) + { + + Eigen::DSizes<Eigen::DenseIndex, N> dsizes; + for (size_t d = 0; d < strides.size(); d++) + { + dsizes[d] = static_cast<Eigen::DenseIndex>(strides[d]); + } + for (size_t d = strides.size(); d < N; d++) + { + dsizes[d] = 1; + } + + output.device(d) = input.inflate(dsizes); + } +}; + +template <typename Device, typename Reducer> struct ReduceFunctor +{ + template <typename OUT_T, typename IN_T, typename ReductionAxes> + static void Reduce(const Device &d, OUT_T out, IN_T in, const ReductionAxes &reduction_axes, + const Reducer &reducer) + { + out.device(d) = in.reduce(reduction_axes, reducer); + } +}; + +template <typename Device, typename T> struct SetZeroFunctor +{ + // Computes on device "d": out = out.setZero(), + void operator()(const Device &d, typename TTypes<T>::Flat out) + { + out.device(d) = out.constant(T(0)); + } +}; + +} // namespace functor + +using ShapeVec = std::vector<int32_t>; +using Labels = std::vector<int32_t>; +using OperandLabels = std::vector<Labels>; +using LabelCounts = std::vector<int32_t>; +using OperandLabelCounts = std::vector<LabelCounts>; +using LabelToDimSizes = std::vector<int32_t>; + +// Each dimension is categorized into exactly one of five types based on +// whether its corresponding label is present in the input and/or the output +// subscripts. +enum DimensionType +{ + // Batch dimensions are those present in two inputs as well as the output. + // They are part of the batch dimensions during Tensor contraction. + // Such dimensions may be broadcasting dimensions (those mapping to + // ellipsis) + // or explicit batch dimensions corresponding to named axis labels. + kBroadcasting = 0, + kBatch = 1, + // Free dimensions are present in exactly one of the inputs, and also the + // output. These are non-contracted axes in the Tensor contraction. + kFree = 2, + // Contract dimensions are present in two inputs, but not the output. These + // dimensions are contracted in Tensor contraction. + kContract = 3, + // Reduce dimensions are present in exactly one input; and not in the output + // and are summed over prior to Tensor contraction. 
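+  // For example, in the equation 'ij->i' the label j appears in exactly one
+  // input and not in the output, so it is a reduce dimension.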
+ kReduce = 4, +}; + +namespace +{ + +constexpr int kEllipsisLabel = -1; + +std::vector<std::string> strSplit(const std::string &text, const std::string delimiter) +{ + std::vector<std::string> result; + + size_t start = 0; + size_t pos = 0; + + do + { + pos = text.find(delimiter, start); + if (pos == std::string::npos) + { + result.push_back(text.substr(start, text.size() - start)); + break; + } + + result.push_back(text.substr(start, pos - start)); + start = pos + delimiter.size(); + } while (pos != std::string::npos); + + return result; +} + +inline DimensionType getDimensionType(bool is_removed, bool is_unique) +{ + if (!is_removed && !is_unique) + return kBatch; + else if (!is_removed && is_unique) + return kFree; + else if (is_removed && !is_unique) + return kContract; + else // is_removed && is_unique + return kReduce; +} + +inline Shape copyShape(const Shape &shape) +{ + return Shape::ExtendedShape(shape.DimensionsCount(), shape); +} +} + +class Einsum +{ +public: + Einsum() : _prepared(false) + { + // DO NOTHING + } + + void prepare(std::string &equation) + { + if (_prepared) + { + return; + } + + // Parse equation + parseEquation(equation); + _prepared = true; + } + + void operator()(std::string &equation, const std::vector<Shape> &input_shapes, + const std::vector<const float *> &input_data, const Shape &output_shape, + float *output_data) + { + if (!_prepared) + { + prepare(equation); + } + + const int num_inputs = input_shapes.size(); + std::vector<InputTensor<float>> inputs(num_inputs); + for (int i = 0; i < num_inputs; i++) + { + inputs[i].shape.ReplaceWith(input_shapes[i].DimensionsCount(), input_shapes[i].DimsData()); + inputs[i].buffer = input_data[i]; + } + + OperandLabels input_labels(_input_labels); + Labels output_labels(_output_labels); + std::vector<DimensionType> label_types(_label_types); + OperandLabelCounts input_label_counts(_input_label_counts); + LabelCounts output_label_counts(_output_label_counts); + LabelToDimSizes label_to_dim_sizes; + + processDimensions(inputs, &input_labels, &output_labels, &label_types, &input_label_counts, + &output_label_counts, &label_to_dim_sizes); + + // The reduction phase (a) sums across reduction dimensions, (b) takes + // generalized diagonals, and (c) reshapes it into shape + // [(broadcasting) batch shape] + [F,C] + // where F and C denote the total (compacted) size of free and contract + // dimensions, respectively. + + OperandLabels free_labels(num_inputs); + std::vector<Tensor> inputs_reduced(num_inputs); + std::vector<bool> swap_free_and_contract(num_inputs); + for (int i = 0; i < num_inputs; ++i) + { + bool temp_swap_free_and_contract = false; + reduceOperand<float>(inputs[i], label_types, input_label_counts[i], &input_labels[i], + &free_labels[i], &temp_swap_free_and_contract, &inputs_reduced[i]); + swap_free_and_contract[i] = temp_swap_free_and_contract; + } + + // After reduction, the inputs should be reshaped to Tensors suitable for + // contraction. If num_inputs is 1, the reduced input is simply forwarded to + // the output. + Tensor contraction_output_reshaped; + contractOperands(inputs_reduced, swap_free_and_contract, &contraction_output_reshaped); + + // Copy the batch labels from the contraction output. Recover the batch + // shape, which may have been broadcasted. 
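+    // (The contracted result has rank [batch rank + 2]: its trailing two
+    // dimensions are the compacted free sizes of the two operands, so only
+    // the leading dimensions collected below are batch dimensions.)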
+ std::vector<int32_t> result_shape_dims(contraction_output_reshaped.shape.DimensionsCount() - 2); + + for (size_t i = 0; i < result_shape_dims.size(); i++) + { + result_shape_dims[i] = contraction_output_reshaped.shape.Dims(i); + } + + int num_labels = label_types.size(); + Labels result_labels; + // All batch dimensions should be present in the contracted result. First + // the broadcasting dimensions, then the named batch dimensions. + for (int label = 0; label < num_labels; ++label) + { + if (label_types[label] == kBroadcasting) + result_labels.push_back(label); + } + for (int label = 0; label < num_labels; ++label) + { + if (label_types[label] == kBatch) + result_labels.push_back(label); + } + for (int i = 0; i < num_inputs; ++i) + { + for (int label : free_labels[i]) + { + result_labels.push_back(label); + result_shape_dims.push_back(label_to_dim_sizes[label]); + } + } + + Shape result_shape(result_shape_dims.size(), result_shape_dims.data()); + + // Reshape the contraction (or reduction) result to its expanded shape: + // [(broadcasted) batch shape] + [free shape 0] + [free shape 1]. + Tensor contraction_output; + copyFrom(contraction_output_reshaped, result_shape, &contraction_output); + + // Inflate the output if necessary. (E.g. for the equation 'i->iii' which + // may arise while computing gradient of a regular Einsum). + // TODO(anudhyan): It's possible that Eigen's contract and inflate can be + // chained here to avoid materializing an intermediate. + Tensor output_inflated; + strideOrInflate<float>(contraction_output, result_labels, output_label_counts, + true /* should_inflate */, &output_inflated); + + if (output_inflated.shape.DimensionsCount() > contraction_output.shape.DimensionsCount()) + { + // We inflated the output. Modify result labels accordingly. + Labels inflated_labels; + for (int label : result_labels) + { + inflated_labels.insert(inflated_labels.end(), output_label_counts[label], label); + } + result_labels.swap(inflated_labels); + } + + // Find the permutation to map the result labels to the output labels. Note + // that both the result and the final output may have the repeated labels, + // in which case the permutation preserves the left-to-right ordering. + // E.g. if result labels are [0, 0, 1] and output is [0, l, 0] then the + // permutation should be [0, 2, 1]. We also use the fact that repeated + // labels in the result are adjacent to each other. + std::vector<int32_t> output_permutation(output_labels.size()); + std::vector<int32_t> label_to_position(num_labels, -1); + for (size_t i = 0; i < result_labels.size(); ++i) + { + // Remember the position of only the leftmost result label. + if (label_to_position[result_labels[i]] == -1) + { + label_to_position[result_labels[i]] = i; + } + } + for (size_t i = 0; i < output_labels.size(); ++i) + { + output_permutation[i] = label_to_position[output_labels[i]]; + // We have found the leftmost occurrence. The next one would be adjacent. 
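+      // (e.g. with result labels [0, 0, 1], the first output occurrence of
+      // label 0 maps to position 0; the increment below makes a second
+      // occurrence map to the adjacent position 1.)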
+ label_to_position[output_labels[i]] += 1; + } + + InputTensor<float> temp_inflated; + temp_inflated.shape.ReplaceWith(output_inflated.shape.DimensionsCount(), + output_inflated.shape.DimsData()); + temp_inflated.buffer = (reinterpret_cast<const float *>(output_inflated.buffer)); + ; + + Tensor output; + transposeOperand<float>(temp_inflated, output_permutation, &output); + + memcpy(output_data, output.buffer, output_shape.FlatSize() * sizeof(float)); + + temp_operand.clear(); + } + +private: + void parseEquation(std::string &equation) + { + std::vector<std::string> input_str; + std::string output_str; + + parseEinsumEquation(equation, input_str, output_str); + + // Temporary map from single character labels to (consecutive) integer + // labels. + std::map<char, int> label_mapping; + int num_inputs = input_str.size(); + _input_labels.resize(num_inputs); + + // Map from single characters to integer labels. + for (int i = 0; i < num_inputs; ++i) + { + mapToLabels(input_str[i], _input_labels.at(i), label_mapping); + } + mapToLabels(output_str, _output_labels, label_mapping); + + // Compute counts for input and output labels. + int num_labels = label_mapping.size(); + _input_label_counts.resize(num_inputs); + _input_has_ellipsis.resize(num_inputs); + for (int i = 0; i < num_inputs; ++i) + { + _input_label_counts.at(i).resize(num_labels); + for (const int label : _input_labels.at(i)) + { + if (label != kEllipsisLabel) + _input_label_counts.at(i)[label] += 1; + else + _input_has_ellipsis.at(i) = true; + } + } + _output_label_counts.resize(num_labels); + for (const int label : _output_labels) + { + if (label != kEllipsisLabel) + _output_label_counts.at(label) += 1; + else + _output_has_ellipsis = true; + } + + // Map each label to a unique DimensionType. + _label_types.resize(num_labels); + for (int label = 0; label < num_labels; ++label) + { + bool removed = (_output_label_counts[label] == 0); + bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 || + _input_label_counts[1][label] == 0; + _label_types[label] = getDimensionType(removed, unique); + } + } + + void parseEinsumEquation(const std::string &equation, std::vector<std::string> &input_subscripts, + std::string &output_subscript) + { + std::vector<std::string> inputs_and_output_subscripts = strSplit(equation, "->"); + if (inputs_and_output_subscripts.size() != 2) + { + throw std::runtime_error{"Einsum: Expecting exactly one '->' in einsum equation: " + + equation}; + } + + output_subscript = inputs_and_output_subscripts[1]; + input_subscripts = strSplit(inputs_and_output_subscripts[0], ","); + if (input_subscripts.size() != 1 && input_subscripts.size() != 2) + { + throw std::runtime_error{"Einsum: Expecting 1 or 2 input subscripts in equation '" + + equation + "' but got: " + std::to_string(input_subscripts.size())}; + } + } + + // Maps the character labels to consecutive integers. + void mapToLabels(const std::string &subscript, Labels &labels, std::map<char, int> &label_mapping) + { + for (size_t i = 0; i < subscript.size(); ++i) + { + const char label_char = subscript[i]; + if (label_char == '.') + { + labels.push_back(kEllipsisLabel); + i += 2; // Skip next 2 characters as well. 
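+        // (An ellipsis is spelled as three consecutive '.' characters, so the
+        // one matched here plus the two skipped above make up "...".)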
+ continue; + } + if (label_mapping.find(label_char) == label_mapping.end()) + { + const int next_label = label_mapping.size(); + label_mapping[label_char] = next_label; + } + const int mapped_label = label_mapping[label_char]; + labels.push_back(mapped_label); + } + } + + template <typename T> + void processDimensions(const std::vector<InputTensor<T>> &inputs, OperandLabels *input_labels, + Labels *output_labels, std::vector<DimensionType> *label_types, + OperandLabelCounts *input_label_counts, LabelCounts *output_label_counts, + LabelToDimSizes *label_to_dim_sizes) + { + if (inputs.size() != input_labels->size()) + { + throw std::runtime_error{"Expected " + std::to_string(input_labels->size()) + + " inputs but got: " + std::to_string(inputs.size())}; + } + const int num_inputs = inputs.size(); + + // We infer the number of broadcasting dimensions by taking the maximum rank + // among the broadcasting subshapes of the input. + int max_bcast_dims = 0; + const int num_named_labels = label_types->size(); + label_to_dim_sizes->resize(num_named_labels); + for (int i = 0; i < num_inputs; ++i) + { + Labels *labels = &(*input_labels)[i]; + + if (!_input_has_ellipsis[i]) + { + if (inputs[i].shape.DimensionsCount() != ((int32_t)labels->size())) + { + throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank " + + std::to_string(labels->size()) + " but got: " + + std::to_string(inputs[i].shape.DimensionsCount())}; + } + for (size_t label_idx = 0; label_idx < labels->size(); ++label_idx) + { + const int label = (*labels)[label_idx]; + recordLabelToDimension(label, label_idx, inputs[i].shape, label_to_dim_sizes); + } + continue; + } + + // Input has an ellipsis. + if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size()) + { + throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " + + std::to_string(labels->size() - 1) + " but got: " + + std::to_string(inputs[i].shape.DimensionsCount())}; + } + int ellipsis_axis = -1; + const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1; + for (size_t label_idx = 0; label_idx < labels->size(); ++label_idx) + { + const int label = (*labels)[label_idx]; + if (label == kEllipsisLabel) + { + ellipsis_axis = label_idx; + continue; + } + // Current label is not an ellipsis. + const int axis = label_idx + (ellipsis_axis == -1 ? 0 : num_bcast_dims - 1); + recordLabelToDimension(label, axis, inputs[i].shape, label_to_dim_sizes); + } + // Found an ellipsis. Replace 'kEllipsisLabel' with broadcasting + // dimensions. + if (ellipsis_axis != -1) + { + insertBroadcastLabels(num_bcast_dims, num_named_labels, ellipsis_axis, labels, + &input_label_counts->at(i)); + max_bcast_dims = std::max(max_bcast_dims, num_bcast_dims); + } + } + + std::vector<bool>::iterator it_input = + std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); + if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis) + { + return; + } + // Insert broadcasting dimensions in the output labels. + auto it = std::find(output_labels->begin(), output_labels->end(), kEllipsisLabel); + if (it != output_labels->end()) + { + const int ellipsis_axis = it - output_labels->begin(); + insertBroadcastLabels(max_bcast_dims, num_named_labels, ellipsis_axis, output_labels, + output_label_counts); + } + else if (max_bcast_dims > 0) + { + std::runtime_error{"Output contains " + std::to_string(max_bcast_dims) + + " broadcasting dimension(s) but no ellipsis " + + "(...) 
was found in the output subscripts."};
+    }
+    // Populate DimensionType for the new broadcasting labels.
+    label_types->resize(num_named_labels + max_bcast_dims, kBroadcasting);
+  }
+
+  void recordLabelToDimension(const int32_t label, const int axis, const Shape &input_shape,
+                              LabelToDimSizes *label_to_dim_sizes)
+  {
+    const int32_t input_dim = input_shape.Dims(axis);
+    // We know that label_to_dim_sizes has the size to accommodate named labels.
+    if (label_to_dim_sizes->at(label) != 0 && label_to_dim_sizes->at(label) != input_dim)
+    {
+      throw std::runtime_error{"Expected dimension " +
+                               std::to_string(label_to_dim_sizes->at(label)) + " at axis " +
+                               std::to_string(axis) + " of the input shape but got dimension " +
+                               std::to_string(input_dim)};
+    }
+    (*label_to_dim_sizes)[label] = input_dim;
+  }
+
+  void insertBroadcastLabels(int num_bcast_dims, int num_named_labels, int ellipsis_axis,
+                             Labels *labels, LabelCounts *label_counts)
+  {
+    labels->erase(labels->begin() + ellipsis_axis);
+    labels->insert(labels->begin() + ellipsis_axis, num_bcast_dims, 0);
+    std::iota(labels->begin() + ellipsis_axis, labels->begin() + ellipsis_axis + num_bcast_dims,
+              num_named_labels);
+    // Increment label counts. Since these are new labels, the count is set
+    // to 1.
+    label_counts->resize(num_named_labels + num_bcast_dims, 1);
+  }
+
+  template <typename T>
+  void reduceOperand(const InputTensor<T> &input, const std::vector<DimensionType> &label_types,
+                     const LabelCounts &label_counts, Labels *labels, Labels *free_labels,
+                     bool *swap_free_and_contract, Tensor *output)
+  {
+    // Find the permutation to transpose the input dimensions in the order of
+    // DimensionType; i.e. batch, free, contract and reduce dimensions. This
+    // makes it more convenient to invoke Reduce/Contract operations.
+    std::vector<int32_t> permutation(input.shape.DimensionsCount());
+    std::iota(permutation.begin(), permutation.end(), 0);
+    Tensor input_transposed;
+
+    // Check if we can avoid the transpose. We need to flip the adj_x (or adj_y)
+    // flag during BatchMatMul. This is an extra optimization not necessary for
+    // correctness.
+    if (shouldSwapFreeAndContract(*labels, label_types))
+    {
+      *swap_free_and_contract = true;
+    }
+    else
+    {
+      std::sort(permutation.begin(), permutation.end(), [&](int i, int j) {
+        int label_i = (*labels)[i];
+        int label_j = (*labels)[j];
+        return std::tie(label_types[label_i], label_i) < std::tie(label_types[label_j], label_j);
+      });
+    }
+    // Transpose the input so that DimensionTypes are in order.
+    transposeOperand<T>(input, permutation, &input_transposed);
+
+    permuteLabels(permutation, labels);
+
+    // Take the generalized diagonal for dimensions with repeated axis labels.
+    Tensor input_deduped;
+    labels->erase(std::unique(labels->begin(), labels->end()), labels->end());
+    strideOrInflate<T>(input_transposed, *labels, label_counts, false /* should_inflate */,
+                       &input_deduped);
+
+    // Reshape denotes the rank-5 shape [broadcast, batch, free, contract,
+    // reduce] where we've compacted the dimensions of each DimensionType.
+    std::vector<int32_t> reshape(5, 1);
+
+    // The output shape is [batch shape] + [free size, contract size]
+    // That is, the batch shape is preserved (for broadcasting while
+    // contracting) while the free dims and contract dims are compressed to one
+    // dimension each.
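+    // (For instance, an input of shape [2, 3, 4, 5] whose labels are typed
+    // (batch, free, free, contract) is compacted below to [2, 12, 5],
+    // assuming no broadcasting or reduce dimensions are present.)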
+ Shape output_shape; + std::vector<int32_t> output_shape_dims; + for (size_t label_idx = 0; label_idx < labels->size(); ++label_idx) + { + const int label = labels->at(label_idx); + int32_t dim = input_deduped.shape.Dims(label_idx); + if (label_types[label] == kBroadcasting || label_types[label] == kBatch) + { + output_shape_dims.push_back(dim); + } + else if (label_types[label] == kFree) + { + free_labels->push_back(label); + } + reshape[label_types[label]] *= dim; + } + + if (*swap_free_and_contract) + std::swap(reshape[kFree], reshape[kContract]); + + output_shape_dims.push_back(reshape[kFree]); + output_shape_dims.push_back(reshape[kContract]); + + output_shape.ReplaceWith(output_shape_dims.size(), output_shape_dims.data()); + + if (reshape[kReduce] == 1) + { // No need to actually reduce. + return copyFrom(input_deduped, output_shape, output); + } + + allocateTemp(output_shape, output); + + using Reducer = Eigen::internal::SumReducer<T>; + using Index = typename TTypes<T>::Tensor::Index; + + const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice(); + + // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor. + const int32_t output_size = + reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; + functor::ReduceFunctor<Eigen::ThreadPoolDevice, Reducer>::Reduce( + device, output->shaped<T, 1>({output_size}), + input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), + Reducer()); + } + + bool shouldSwapFreeAndContract(const Labels &labels, + const std::vector<DimensionType> &label_types) + { + // Check that ordering is according to dimension type, with the role of + // free and contract dimensions swapped. + std::vector<int> remap = {0, 1, 3, 2, 4}; + for (size_t i = 0; i + 1 < labels.size(); ++i) + { + const int dimtype_a = remap[label_types[labels[i]]]; + const int dimtype_b = remap[label_types[labels[i + 1]]]; + if (dimtype_a > dimtype_b || (dimtype_a == dimtype_b && labels[i] > labels[i + 1])) + { + return false; + } + } + return true; + } + + template <typename T> + void transposeOperand(const InputTensor<T> &input, const std::vector<int32_t> &permutation, + Tensor *output) + { + if (!shouldTranspose(input.shape, permutation)) + { + copyFrom(input, input.shape, output); + return; + } + Shape transposed_shape(input.shape.DimensionsCount()); + for (int i = 0; i < input.shape.DimensionsCount(); ++i) + { + transposed_shape.SetDim(i, input.shape.Dims(permutation[i])); + } + // For empty Tensors, just change the shape. E.g. we may need to transpose + // from shape [1, 0, 5] to [5, 1, 0]. 
+ if (input.shape.FlatSize() == 0) + { + copyFrom(input, transposed_shape, output); + return; + } + + temp_operand.emplace_back(std::make_unique<T[]>(transposed_shape.FlatSize())); + T *new_buffer = temp_operand.back().get(); + + TransposeParams transpose_params; + transpose_params.perm_count = permutation.size(); + for (size_t i = 0; i < permutation.size(); i++) + { + transpose_params.perm[i] = permutation[i]; + } + + Transpose<T>(transpose_params, input.shape, input.buffer, transposed_shape, new_buffer); + + output->shape.ReplaceWith(transposed_shape.DimensionsCount(), transposed_shape.DimsData()); + output->buffer = new_buffer; + } + + bool shouldTranspose(const Shape &input_shape, const std::vector<int32_t> &permutation) + { + if (input_shape.DimensionsCount() < 2) + return false; + for (size_t i = 0; i < permutation.size(); ++i) + { + if (permutation[i] != (int32_t)i) + return true; + } + return false; + } + + template <typename T> + void copyFrom(const InputTensor<T> &input, const Shape &shape, Tensor *output) + { + Tensor temp_tensor; + temp_tensor.shape.ReplaceWith(input.shape.DimensionsCount(), input.shape.DimsData()); + temp_operand.emplace_back(std::make_unique<float[]>(input.shape.FlatSize())); + temp_tensor.buffer = temp_operand.back().get(); + memcpy(temp_tensor.buffer, input.buffer, input.shape.FlatSize() * sizeof(float)); + + copyFrom(temp_tensor, shape, output); + } + + void copyFrom(const Tensor &input, const Shape &shape, Tensor *output) + { + if (output->copyFrom(input, shape)) + return; + + throw std::runtime_error{"Einsum: Encountered error while reshaping a Tensor"}; + } + + // Permutes the labels according to the given permutation. + void permuteLabels(const std::vector<int32_t> &permutation, Labels *labels) + { + Labels permuted_labels(labels->size()); + for (size_t i = 0; i < labels->size(); ++i) + { + permuted_labels[i] = (*labels)[permutation[i]]; + } + labels->swap(permuted_labels); + } + + // If there are repeated labels in either the input or output, then this + // strides the input (e.g. iii->i) or inflates it (e.g. i->iii), respectively. + template <typename T> + void strideOrInflate(const Tensor &input, const Labels &labels, const LabelCounts &label_counts, + const bool should_inflate, Tensor *output) + { + // Return early if there are no repeated indices. + if (std::all_of(label_counts.begin(), label_counts.end(), [](int c) { return c <= 1; })) + { + return copyFrom(input, input.shape, output); + } + // We reshape so that each repeated label is compressed to one dimension. + // E.g. For iiij -> ij, The shape [3, 3, 3, 5] would be compressed to [27, + // 5]. Striding appropriately (in this case with strides 14 (=1+3+9) and 1) + // recovers the generalized diagonal of shape [3, 5]. + std::vector<int32_t> reshape; + std::vector<int32_t> strides; + // Strided and inflated shapes correspond to input and output shapes, + // respectively, should_inflate is true (vice-versa if should_inflate is + // false). E.g. they are [3, 5] and [3, 3, 3, 5] in the above example. + Shape strided_shape; + Shape inflated_shape; + std::vector<int32_t> strided_shape_dims; + std::vector<int32_t> inflated_shape_dims; + for (int label : labels) + { + const int32_t count = label_counts[label]; + const int current_axis = + should_inflate ? 
strided_shape_dims.size() : inflated_shape_dims.size(); + const int32_t dim = input.shape.Dims(current_axis); + strided_shape_dims.push_back(dim); + inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim); + const int32_t reshape_dim = std::pow(dim, count); + reshape.push_back(reshape_dim); + // While taking the d-diagonal in a rank k Tensor, we take d + // equally-spaced elements including the first and last element. Then, (k + // - 1) * stride = d^k - 1, or, stride = (d^k - 1)/(d - 1). + const int32_t stride = (dim > 1 && count > 1) ? (reshape_dim - 1) / (dim - 1) : 1; + strides.push_back(stride); + } + + strided_shape.ReplaceWith(strided_shape_dims.size(), strided_shape_dims.data()); + inflated_shape.ReplaceWith(inflated_shape_dims.size(), inflated_shape_dims.data()); + + Shape output_shape = Shape(should_inflate ? inflated_shape : strided_shape); + + output->shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData()); + temp_operand.emplace_back(std::make_unique<float[]>(output_shape.FlatSize())); + output->buffer = temp_operand.back().get(); + + const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice(); + + switch (reshape.size()) + { +#define NDIMS_CASE(N) \ + case N: \ + { \ + if (should_inflate) \ + { \ + auto output_map = output->shaped<T, N>(reshape); \ + auto input_map = input.shaped<T, N>(strided_shape_dims); \ + functor::InflateFunctor<Eigen::ThreadPoolDevice, T, N>()(device, input_map, strides, \ + output_map); \ + } \ + else \ + { \ + auto input_map = input.shaped<T, N>(reshape); \ + auto output_map = output->shaped<T, N>(strided_shape_dims); \ + functor::StrideFunctor<Eigen::ThreadPoolDevice, T, N>()(device, input_map, strides, \ + output_map); \ + } \ + } \ + break; + NDIMS_CASE(1); + NDIMS_CASE(2); + NDIMS_CASE(3); + NDIMS_CASE(4); + NDIMS_CASE(5); + NDIMS_CASE(6); + default: + throw std::runtime_error{"Unsupported rank: " + std::to_string(reshape.size()) + + " while handling repeated indices. Up to rank 6 is supported."}; +#undef NDIMS_CASE + } + } + + void allocateTemp(const Shape &shape, Tensor *output) + { + output->shape.ReplaceWith(shape.DimensionsCount(), shape.DimsData()); + temp_operand.emplace_back(std::make_unique<float[]>(shape.FlatSize())); + output->buffer = temp_operand.back().get(); + } + + // Contracts the inputs along the last axis. (or the second last if the + // corresponding value of swap_free_and_contract is true). The batch + // dimensions are broadcast to the output shape. + // TODO(anudhyan): Factor this function into a BatchMatMul functor and support + // transpose_x and transpose_y attributes (in addition to adj_x and adj_y). + // Also, the BatchMatMul might devolve into a component-wise multiplication + // when the matrix shape is [1,1]; in this case BatchMatMul functor would be + // very inefficient. The functor should detect if this is the case and perform + // componentwise multiplication functor instead. 
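+  // (A shape sketch, assuming neither operand has free and contract swapped:
+  // reduceOperand leaves the inputs as [batch..., F0, C] and [batch..., F1, C];
+  // reshapeToRank3 flattens them to [B, F0, C] and [B, F1, C], and BatchMatMul
+  // with adj_y = true computes lhs * rhs^T, yielding the [B, F0, F1] result
+  // that is reshaped back to the expanded output shape by the caller.)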
+ void contractOperands(std::vector<Tensor> &inputs, std::vector<bool> &swap_free_and_contract, + Tensor *output) + { + if (inputs.size() == 1) + return copyFrom(inputs[0], inputs[0].shape, output); + + MatMulBCast bcast(inputs[0].shape, inputs[1].shape); + if (!bcast.IsValid()) + { + throw std::runtime_error{"Einsum: Invalid broadcasting dimensions"}; + } + + Tensor lhs; + reshapeToRank3(inputs[0], bcast.x_batch_size(), &lhs); + Tensor rhs; + reshapeToRank3(inputs[1], bcast.y_batch_size(), &rhs); + Shape old_output_shape = bcast.output_batch_shape(); + Shape output_shape(old_output_shape.DimensionsCount() + inputs.size()); + for (int i = 0; i < old_output_shape.DimensionsCount(); i++) + { + output_shape.SetDim(i, old_output_shape.Dims(i)); + } + + for (size_t i = 0; i < inputs.size(); ++i) + { + const int32_t free_axis = + inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); + output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis)); + } + bool adj_x = swap_free_and_contract[0]; + bool adj_y = !swap_free_and_contract[1]; + + allocateTemp(output_shape, output); + + const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice(); + + if (lhs.shape.FlatSize() == 0 || rhs.shape.FlatSize() == 0) + { + functor::SetZeroFunctor<Eigen::ThreadPoolDevice, float> set_zero; + set_zero(device, + typename TTypes<float, 1>::Tensor(output->base<float>(), output->shape.FlatSize())); + return; + } + + Tensor output_reshaped; + reshapeToRank3(*output, bcast.output_batch_size(), &output_reshaped); + + // LaunchBatchMatMul::Launch(lhs, rhs, adj_x, adj_y, bcast, &output_reshaped); + BatchMatMul batchMatMul; + batchMatMul.prepare(lhs.shape, rhs.shape, adj_x, adj_y); + batchMatMul(lhs.shape, lhs.base<float>(), rhs.shape, rhs.base<float>(), adj_x, adj_y, + output_reshaped.shape, output_reshaped.base<float>()); + } + + void reshapeToRank3(const Tensor &input, int batch_size, Tensor *output) + { + const int rank = input.shape.DimensionsCount(); + Shape output_shape({batch_size, input.shape.Dims(rank - 2), input.shape.Dims(rank - 1)}); + copyFrom(input, output_shape, output); + } + +private: + bool _prepared; + + OperandLabels _input_labels; + Labels _output_labels; + std::vector<DimensionType> _label_types; + OperandLabelCounts _input_label_counts; + LabelCounts _output_label_counts; + std::vector<bool> _input_has_ellipsis; + bool _output_has_ellipsis = false; + + std::vector<std::unique_ptr<float[]>> temp_operand; +}; + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_EINSUM_H__ diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h new file mode 100644 index 000000000..9d080d89b --- /dev/null +++ b/compute/cker/include/cker/operation/Elementwise.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_ELEMENTWISE_H__ +#define __NNFW_CKER_ELEMENTWISE_H__ + +#include "cker/eigen/Utils.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ + +inline void Sin(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = std::sin(input_data[i]); + } +} + +inline void Cos(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = std::cos(input_data[i]); + } +} + +inline void Abs(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); + output_map.array() = input_map.array().abs(); +} + +inline void Rsqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = 1.f / std::sqrt(input_data[i]); + } +} + +template <typename T> +inline void Neg(const Shape &input_shape, const T *input_data, const Shape &output_shape, + T *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = -input_data[i]; + } +} + +inline void Log(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = std::log(input_data[i]); + } +} + +inline void Floor(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::floor(input_data[i]); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ELEMENTWISE_H__ diff --git a/compute/cker/include/cker/operation/Erf.h b/compute/cker/include/cker/operation/Erf.h new file mode 100644 index 000000000..a9be3654a --- /dev/null +++ b/compute/cker/include/cker/operation/Erf.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_ERF_H__ +#define __NNFW_CKER_ERF_H__ + +#include "cker/Shape.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void Erf(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = std::erf(input_data[i]); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ERF_H__ diff --git a/compute/cker/include/cker/operation/Exp.h b/compute/cker/include/cker/operation/Exp.h new file mode 100644 index 000000000..ed3c73d73 --- /dev/null +++ b/compute/cker/include/cker/operation/Exp.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EXP_H__ +#define __NNFW_CKER_EXP_H__ + +#include "cker/Shape.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void Exp(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = std::exp(input_data[i]); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_EXP_H__ diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h new file mode 100644 index 000000000..14daf9839 --- /dev/null +++ b/compute/cker/include/cker/operation/Fill.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_FILL_H__
+#define __NNFW_CKER_FILL_H__
+
+#include "cker/Shape.h"
+
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T>
+inline void Fill(const Shape &input_shape, int *input_data, const T value_data,
+ const Shape &output_shape, T output_data)
+{
+ int input_size = input_shape.FlatSize();
+ int output_size = 1;
+ for (int i = 0; i < input_size; i++)
+ {
+ output_size *= input_data[i];
+ }
+
+ if (output_size == output_shape.FlatSize())
+ {
+ for (int i = 0; i < output_size; i++)
+ {
+ output_data[i] = *value_data;
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Cker Fill.h: output size does not match the size inferred from input");
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FILL_H__
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
new file mode 100644
index 000000000..958532402
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
+#define __NNFW_CKER_FULLY_CONNECTED_H__
+
+#include <ruy/context.h>
+#include "cker/operation/FullyConnectedDense16x1.h"
+#include "cker/operation/FullyConnectedSparse16x1.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+#include "cker/neon/neon_check.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+class FCTempArena
+{
+public:
+ FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch()
+ {
+ // DO NOTHING
+ }
+
+ void prepare(const Shape &input_shape, const Shape &weights_shape)
+ {
+ auto input_size = input_shape.FlatSize();
+ input_quantized.resize(input_size);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ int batch_size = input_size / weights_shape.Dims(1);
+ scaling_factors.resize(batch_size);
+ prepared = true;
+ }
+
+public:
+ bool prepared;
+ std::vector<int8_t> input_quantized;
+ std::vector<float> scaling_factors;
+ std::vector<int32_t> accum_scratch;
+};
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &, const float *bias_data,
+ const Shape &, float *output_data)
+{
+ int total_input_size = input_shape.FlatSize();
+ int input_size = weights_shape.Dims(1);
+ const int batch_size = total_input_size / input_size;
+ const int num_units = weights_shape.Dims(0);
+
+ // Output = bias if bias tensor exists.
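+ // (Each of the batch_size output rows starts as a copy of the bias vector
+ // of length num_units, so the matrix multiplication below can simply
+ // accumulate on top of it.)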
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batch_size * num_units);
+ }
+
+ // Compute output += weight * input
+ MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
+ output_data, /*result_stride=*/1);
+
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
+}
+
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ UNUSED_RELEASE(input_shape);
+ UNUSED_RELEASE(bias_shape);
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(filter_shape.DimensionsCount() >= 2);
+ assert(output_shape.DimensionsCount() >= 1);
+
+ assert(output_activation_min <= output_activation_max);
+ // TODO(benoitjacob): This really should be:
+ // const int batches = ArraySize(output_dims, 1);
+ // but the current --variable_batch hack consists in overwriting the 3rd
+ // dimension with the runtime batch size, as we don't keep track for each
+ // array of which dimension is the batch dimension in it.
+ const int output_dim_count = output_shape.DimensionsCount();
+ const int filter_dim_count = filter_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+ const int output_depth =
+ MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
+ const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_c = 0; out_c < output_depth; ++out_c)
+ {
+ int32_t acc = 0;
+ for (int d = 0; d < accum_depth; ++d)
+ {
+ int32_t input_val = input_data[b * accum_depth + d];
+ int32_t filter_val = filter_data[out_c * accum_depth + d];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ if (bias_data)
+ {
+ acc += bias_data[out_c];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
+ }
+ }
+}
+
+inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const int8_t *filter_data, const Shape &, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ FCTempArena &temp_arena, ruy::Context *ruy_context)
+{
+ int total_input_size = input_shape.FlatSize();
+ const int input_size = filter_shape.Dims(1);
+ const int batch_size = total_input_size / input_size;
+ const int num_units = filter_shape.Dims(0);
+
+ // Output = bias if bias tensor exists.
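+ // Sketch of the hybrid path below: the float input is symmetrically
+ // quantized to int8 per batch row, the int8 weights are multiplied in the
+ // integer domain, and each row's scaling factor (input scale * weight
+ // scale) converts the accumulated result back to float.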
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batch_size * num_units);
+ }
+
+ // Save matrix multiplication computation for all zero input.
+ if (IsZeroVector(input_data, total_input_size))
+ {
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ return;
+ }
+
+ // Quantize input from float to int8 + quantization params (scaling factor).
+ float unused_min, unused_max;
+ float *scaling_factors_ptr = temp_arena.scaling_factors.data();
+ int8_t *quant_data = temp_arena.input_quantized.data();
+
+ // Quantize each batch independently.
+ for (int b = 0; b < batch_size; ++b)
+ {
+ const int offset = b * input_size;
+ SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
+ &unused_max, &scaling_factors_ptr[b]);
+ // Incorporate scaling of the filter.
+ scaling_factors_ptr[b] *= params.weights_scale;
+ }
+
+// Compute output += weight * quantized_input
+#ifdef USE_RUY_GEMV
+ auto output_size = output_shape.FlatSize();
+ temp_arena.accum_scratch.resize(output_size);
+ int32_t *scratch = temp_arena.accum_scratch.data();
+ MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
+ scaling_factors_ptr, batch_size, scratch, output_data,
+ /*result_stride=*/1, ruy_context);
+#else
+ MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
+ scaling_factors_ptr, batch_size, output_data,
+ /*result_stride=*/1);
+ UNUSED_RELEASE(ruy_context);
+ UNUSED_RELEASE(output_shape);
+#endif
+
+ // Apply activation function to floats.
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
+ return;
+}
+
+inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
+ const Shape &input_shape, const float *input_data,
+ const Shape &weights_shape, const float *weights_data,
+ const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ const uint16_t *w1_segments,
+ const uint16_t *w1_indices)
+{
+ UNUSED_RELEASE(params);
+ UNUSED_RELEASE(input_shape);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ assert(output_shape.DimensionsCount() == 2);
+
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+
+ UNUSED_RELEASE(bias_shape);
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batches * output_depth);
+ }
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
+ {
+ for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ int idx_1 = w1_indices[pw1];
+ output_data[b * output_depth + idx_0] +=
+ weights_data[pw1] * input_data[b * accum_depth + idx_1];
+ }
+ }
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FULLY_CONNECTED_H__
diff --git a/compute/cker/include/cker/operation/FullyConnectedDense16x1.h b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h
new file mode 100644
index 000000000..a7e9efd7f
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
+#define __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+#if defined(__aarch64__) && defined(USE_NEON)
+inline void FullyConnected16x1Float32(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &,
+ const float *bias_data, const Shape &, float *output_data)
+{
+ int total_input_size = input_shape.FlatSize();
+ int input_size = weights_shape.Dims(1);
+ const int batch_size = total_input_size / input_size;
+ const int num_units = weights_shape.Dims(0);
+
+ float *out = output_data;
+ const float *weights = weights_data;
+ int rows = num_units;
+ int cols = input_size;
+ int col_stride = input_size;
+ const float *x = input_data;
+
+ // Output = bias if bias tensor exists.
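+ // This kernel assumes the 16x1 blocked weight layout: rows (num_units) is
+ // a multiple of 16, and each 16-row panel stores its weights contiguously,
+ // 16 values per input column, as consumed by the NEON loop below.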
+ if (bias_data) + { + VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data); + } + else + { + ZeroVector(output_data, batch_size * num_units); + } + + // rows : out, cols : in + int i, j; + for (i = 0; i < rows; i += 16) + { + const float *w = &weights[i * col_stride]; + + /* keep y[0..15] in registers for duration of inner loop */ + float *__restrict y = &out[i]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + float32x4_t y8_11 = vld1q_f32(&y[8]); + float32x4_t y12_15 = vld1q_f32(&y[12]); + + for (j = 0; j < cols; j++) + { + float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; + float32x4_t xj; + + xj = vld1q_dup_f32(&x[j]); + + wvec0_3 = vld1q_f32(&w[0]); + y0_3 = vmlaq_f32(y0_3, wvec0_3, xj); + wvec4_7 = vld1q_f32(&w[4]); + y4_7 = vmlaq_f32(y4_7, wvec4_7, xj); + wvec8_11 = vld1q_f32(&w[8]); + y8_11 = vmlaq_f32(y8_11, wvec8_11, xj); + wvec12_15 = vld1q_f32(&w[12]); + y12_15 = vmlaq_f32(y12_15, wvec12_15, xj); + + w += 16; + } + + /* save y[0..15] back to memory */ + + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + vst1q_f32(&y[8], y8_11); + vst1q_f32(&y[12], y12_15); + } + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } +} +#endif +} // namespace cker +} // namespace nnfw +#endif // __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__ diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h new file mode 100644 index 000000000..28ae7a3bc --- /dev/null +++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Copyright (c) 2018 Mozilla + 2008-2011 Octasic Inc. + 2012-2017 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params,
+ const Shape &input_shape, const float *input_data,
+ const Shape &weights_shape, const float *weights_data,
+ const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ const uint16_t *w1_segments, const uint16_t *w1_indices)
+{
+ UNUSED_RELEASE(input_shape);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ assert(output_shape.DimensionsCount() == 2);
+
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+
+ UNUSED_RELEASE(bias_shape);
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batches * output_depth);
+ }
+ for (int b = 0; b < batches; ++b)
+ {
+ int depth_size = output_depth / 16;
+ for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
+#ifdef USE_NEON
+ {
+ float *__restrict y;
+ y = &output_data[b * output_depth + idx_0 * 16];
+ /* keep y[0..15] in registers for duration of inner loop */
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+ for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ auto idx_1 = w1_indices[pw1];
+ float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
+ float32x4_t wvec;
+
+ wvec = vld1q_f32(&weights_data[0]);
+ y0_3 = vmlaq_f32(y0_3, wvec, xj);
+ wvec = vld1q_f32(&weights_data[4]);
+ y4_7 = vmlaq_f32(y4_7, wvec, xj);
+ wvec = vld1q_f32(&weights_data[8]);
+ y8_11 = vmlaq_f32(y8_11, wvec, xj);
+ wvec = vld1q_f32(&weights_data[12]);
+ y12_15 = vmlaq_f32(y12_15, wvec, xj);
+
+ weights_data += 16;
+ }
+ /* save y[0..15] back to memory */
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+ }
+#else
+ {
+ for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ float *__restrict y;
+ float xj;
+ auto idx_1 = w1_indices[pw1];
+ xj = input_data[b * accum_depth + idx_1];
+ y = &output_data[b * output_depth + idx_0 * 16];
+ y[0] += weights_data[0] * xj;
+ y[1] += weights_data[1] * xj;
+ y[2] += weights_data[2] * xj;
+ y[3] += weights_data[3] * xj;
+ y[4] += weights_data[4] * xj;
+ y[5] += weights_data[5] * xj;
+ y[6] += weights_data[6] * xj;
+ y[7] += weights_data[7] * xj;
+ y[8] += weights_data[8] * xj;
+ y[9] += weights_data[9] * xj;
+ y[10] += weights_data[10] * xj;
+ y[11] += weights_data[11] * xj;
+ y[12] += weights_data[12] * xj;
+ y[13] += weights_data[13] * xj;
+ y[14] += weights_data[14] * xj;
+ y[15] += weights_data[15] * xj;
+ weights_data += 16;
+ }
+ }
+#endif
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
+ }
+}
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h
new file mode 100644
index 000000000..d17a5796b
--- /dev/null
+++ b/compute/cker/include/cker/operation/FusedBatchNorm.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FUSEDBATCHNORM_H__
+#define __NNFW_CKER_FUSEDBATCHNORM_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/Helper/Tensor.h"
+#include "cker/operation/Helper/MatmulBCast.h"
+
+#include "Transpose.h"
+#include "BatchMatMul.h"
+
+#include <string>
+#include <vector>
+#include <map>
+#include <numeric>
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+
+class FusedBatchNorm
+{
+public:
+ FusedBatchNorm() : _prepared(false)
+ {
+ // DO NOTHING
+ }
+
+ void prepare() { _prepared = true; }
+
+ void operator()(const std::vector<Shape> &input_shapes,
+ const std::vector<const float *> &input_data, const Shape &output_shape,
+ float *output_data, FusedBatchNormParams param)
+ {
+ // TODO: support fused_batch_norm when is_training is false
+ assert(param.is_training == true);
+
+ // TODO: support case where dim[1] != 1 or dim[3] != 1.
+ // Here we only support input tensor of [B, 1, X, 1] shape
+ assert(input_shapes[0].Dims(1) == 1 && input_shapes[0].Dims(3) == 1);
+
+ if (!_prepared)
+ {
+ prepare();
+ }
+
+ Tensor transformed_input[5];
+ Tensor transformed_output;
+
+ const int num_inputs = input_shapes.size();
+ std::vector<InputTensor<float>> inputs(num_inputs);
+ for (int i = 0; i < num_inputs; i++)
+ {
+ inputs[i].shape.ReplaceWith(input_shapes[i].DimensionsCount(), input_shapes[i].DimsData());
+ inputs[i].buffer = input_data[i];
+ copyFrom<float>(inputs[i], inputs[i].shape, &transformed_input[i]);
+ }
+
+ InputTensor<float> output;
+ output.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
+ output.buffer = output_data;
+ copyFrom<float>(output, output.shape, &transformed_output);
+
+ // TODO: support transpose if data_format is NCHW
+ // Here, Eigen uses the RowMajor kernel (NHWC)
+
+ typename TTypes<float, 4>::Tensor x(transformed_input[0].shaped<float, 4>());
+ typename TTypes<float, 4>::Tensor y(transformed_output.shaped<float, 4>());
+ typename TTypes<float, 1>::Tensor scale(transformed_input[1].shaped<float, 1>());
+ typename TTypes<float, 1>::Tensor offset(transformed_input[2].shaped<float, 1>());
+
+ const int depth = x.dimension(3);
+ const int size = x.size();
+ const int rest_size = size / depth;
+ Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth);
+
+ Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth);
+ Eigen::array<int, 1> reduce_dims({0});
+ Eigen::array<int, 2> bcast_spec({rest_size, 1});
+
+ auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<float>();
+ const int rest_size_minus_one = (rest_size > 1) ? (rest_size - 1) : 1;
+ float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size));
+ // This adjustment is for Bessel's correction
+ float rest_size_adjust =
+ static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one);
+
+ Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth);
+ Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth);
+
+ const Eigen::ThreadPoolDevice &d = *eigen_support::GetThreadPoolDevice();
+
+ batch_mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv);
+ auto x_centered = x_rest_by_depth - batch_mean.reshape(one_by_depth).broadcast(bcast_spec);
+
+ batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv;
+ auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale)
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(bcast_spec);
+ auto x_scaled = x_centered * scaling_factor;
+ auto x_shifted =
+ (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>();
+
+ UNUSED_RELEASE(rest_size_adjust);
+
+ y.reshape(rest_by_depth).device(d) = x_shifted;
+
+ memcpy(output_data, y.data(), output_shape.FlatSize() * sizeof(float));
+ }
+
+ template <typename T>
+ void copyFrom(const InputTensor<T> &input, const Shape &shape, Tensor *output)
+ {
+ Tensor temp_tensor;
+ temp_tensor.shape.ReplaceWith(input.shape.DimensionsCount(), input.shape.DimsData());
+ temp_operand.emplace_back(std::make_unique<float[]>(input.shape.FlatSize()));
+ temp_tensor.buffer = temp_operand.back().get();
+ memcpy(temp_tensor.buffer, input.buffer, input.shape.FlatSize() * sizeof(float));
+
+ copyFrom(temp_tensor, shape, output);
+ }
+
+ void copyFrom(const Tensor &input, const Shape &shape, Tensor *output)
+ {
+ if (output->copyFrom(input, shape))
+ return;
+
+ throw std::runtime_error{"FusedBatchNorm: Encountered error while reshaping a Tensor"};
+ }
+
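+ // Usage sketch (illustrative; x, scale, offset and y are hypothetical
+ // caller-provided buffers, and shapes follow the [B, 1, X, 1] restriction
+ // asserted above):
+ //
+ // FusedBatchNorm fbn;
+ // FusedBatchNormParams param;
+ // param.is_training = true;
+ // param.epsilon = 1e-3f;
+ // std::vector<Shape> shapes{Shape{1, 1, 8, 1}, Shape{1}, Shape{1}};
+ // std::vector<const float *> data{x, scale, offset};
+ // fbn(shapes, data, Shape{1, 1, 8, 1}, y, param);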
+private:
+ bool _prepared;
+ std::vector<std::unique_ptr<float[]>> temp_operand;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FUSEDBATCHNORM_H__
diff --git a/compute/cker/include/cker/operation/Gather.h b/compute/cker/include/cker/operation/Gather.h
new file mode 100644
index 000000000..65a71887e
--- /dev/null
+++ b/compute/cker/include/cker/operation/Gather.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_GATHER_H__
+#define __NNFW_CKER_GATHER_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cstring>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T, typename CoordsT = int32_t>
+inline void Gather(const GatherParams &op_params, const Shape &input_shape, const T *input_data,
+ const Shape &coords_shape, const CoordsT *coords_data, const Shape &,
+ T *output_data)
+{
+ int axis = op_params.axis;
+ if (axis < 0)
+ {
+ axis += input_shape.DimensionsCount();
+ }
+ assert(axis >= 0);
+ assert(axis < input_shape.DimensionsCount());
+ const int axis_size = input_shape.Dims(axis);
+ const int coords_count = coords_shape.FlatSize();
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int inner_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ inner_size *= input_shape.Dims(i);
+ }
+
+ for (int outer = 0; outer < outer_size; ++outer)
+ {
+ for (int i = 0; i < coords_count; ++i)
+ {
+ assert(coords_data[i] >= 0);
+ assert(coords_data[i] < axis_size);
+ std::memcpy(output_data + (outer * coords_count + i) * inner_size,
+ input_data + (outer * axis_size + coords_data[i]) * inner_size,
+ sizeof(T) * inner_size);
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_GATHER_H__
diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h
new file mode 100644
index 000000000..a0abf2935
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/BCast.h
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_BCAST_H__
+#define __NNFW_CKER_HELPER_BCAST_H__
+
+/**
+ * TODO: This file will be moved into an upper folder when it is integrated
+ * with other custom operations, and it should be merged with EinsumHelper's
+ * BCast.
+**/
+
+#include "cker/Shape.h"
+#include "cker/eigen/EigenSupport.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+// Returns the mapping from the output batch indices to the corresponding
+// input's batch indices, given the input's "reshape" and "bcast" shapes as
+// returned by the BCastList helper class. The i'th element denotes the
+// (flattened) batch index of the input that must be used to compute the i'th
+// batch output.
+//
+inline void ComputeBatchIndices(const int32_t output_batch_size,
+ const std::vector<int32_t> &reshape,
+ const std::vector<int32_t> &bcast,
+ std::vector<int32_t> *out_indices)
+{
+ // Populates the mapping in out_indices. This algorithm is identical to
+ // the following steps:
+ // - Reshape {0, 1, ..., input_batch_size - 1} to the input shape.
+ // - Broadcast to the output shape.
+ // - Reshape back to a flat 1D vector.
+ out_indices->resize(output_batch_size);
+ int32_t num_output_elements = 1;
+ int32_t num_input_elements = 1;
+ for (int32_t i = reshape.size() - 1; i >= 0; --i)
+ {
+ // Replicate the already populated mapping an additional (dim - 1) times.
+ // If we are broadcasting, just copy the existing mapping.
+ // Otherwise, add another dimension from the input shape.
+ const int32_t dim = std::max(reshape[i], bcast[i]);
+ const int32_t incr = bcast[i] > 1 ? 0 : num_input_elements;
+ for (int32_t k = 0; k < (dim - 1) * num_output_elements; ++k)
+ {
+ (*out_indices)[num_output_elements + k] = (*out_indices)[k] + incr;
+ }
+ num_output_elements *= dim;
+ num_input_elements *= reshape[i];
+ }
+}
+
+template <int N> class BCastList
+{
+public:
+ // A vector of int32_t representing the shape of tensor. The 0-th
+ // element is the outer-most dimension and the last element is the
+ // inner-most dimension. Note that we do not use Shape since
+ // it's more convenient to manipulate Vec directly for this module.
+ typedef std::vector<int32_t> Vec;
+
+ // Constructs all helper shapes, following the aforementioned rules.
+ //
+ // If "fewer_dims_optimization" is set to true (the default), the
+ // implementation tries to reduce intermediate dimensions needed to be more
+ // efficient. This is transparent to the caller.
+ //
+ // If false, all intermediate shapes (except for grad_{x,y}_reduce_idx()) have
+ // the same number of dimensions as the larger of the two inputs.
+ //
+ // If return_flattened_batch_indices is true, the implementation will compute
+ // for each output member of the flattened output, which batch indices of
+ // each input correspond to it. This is disabled by default.
+ explicit BCastList(const Vec (&x)[N], const bool fewer_dims_optimization = true,
+ const bool return_flattened_batch_indices = false);
+ ~BCastList() {}
+
+ // Returns true iff two operands are compatible according to the
+ // broadcasting rule.
+ bool IsValid() const { return valid_; }
+ bool IsBroadcastingRequired() const { return broadcasting_required_; }
+
+ // If and only if IsValid(), the following fields can be used in
+ // implementing a broadcasted binary tensor operation according to
+ // the broadcasting rule.
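+ //
+ // Example (illustrative): for N == 2 with x[0] = {2, 1, 3} and
+ // x[1] = {1, 4, 3}, the broadcast is valid and output_shape() is
+ // {2, 4, 3}; with fewer_dims_optimization enabled, runs of compatible
+ // dimensions may be merged, so reshape(i)/bcast(i) describe coarser
+ // intermediate shapes with the same total element count.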
+ const Vec &reshape(int i) const { return reshape_[i]; }
+ const Vec &bcast(int i) const { return bcast_[i]; }
+ const Vec &result_shape() const { return result_; }
+ const Vec &output_shape() const { return output_; }
+ const Vec &grad_reduce_idx(int i) const { return grad_reduce_idx_[i]; }
+ int32_t output_batch_size() const { return output_batch_size_; }
+
+ // Returns the mapping from the flattened output batch indices to x's
+ // flattened batch indices. The result is a vector of length
+ // output_batch_size(). To compute the i'th batch output, a binary matmul-like
+ // operation should use the `x_batch_indices()[i]`th batch index of `x`.
+ // Note: Returns an empty vector if broadcasting is not required. Callers
+ // should only use this when IsBroadcastingRequired() returns true.
+ const std::vector<int32_t> &batch_indices(int i) const { return batch_indices_[i]; }
+
+protected:
+ bool valid_ = true;
+ bool broadcasting_required_ = true;
+ Vec reshape_[N];
+ Vec bcast_[N];
+ Vec result_;
+ Vec output_;
+ Vec grad_reduce_idx_[N];
+
+ int32_t output_batch_size_;
+ std::vector<int32_t> batch_indices_[N];
+
+ static void Reverse(Vec *shape) { std::reverse(shape->begin(), shape->end()); }
+}; // BCastList<N>
+
+template <int N>
+BCastList<N>::BCastList(const BCastList::Vec (&x)[N], const bool fewer_dims_optimization,
+ const bool return_flattened_batch_indices)
+{
+ typedef BCastList::Vec Vec;
+ bool all_equal = true;
+ size_t largest_rank = 0;
+ output_batch_size_ = 1;
+ for (int i = 0; i < N; ++i)
+ {
+ if (x[i] != x[0])
+ {
+ all_equal = false;
+ }
+ if (x[i].size() > largest_rank)
+ {
+ largest_rank = x[i].size();
+ }
+ }
+ if (all_equal)
+ {
+ broadcasting_required_ = false;
+ }
+ if (all_equal && fewer_dims_optimization)
+ {
+ // Fast path for common case of identical shapes.
+ int32_t elements = 1;
+ const int rank = x[0].size();
+ output_.resize(rank);
+ for (int i = 0; i < rank; i++)
+ {
+ const int32_t dim = x[0][i];
+ elements *= dim;
+ output_[i] = dim;
+ }
+ result_.push_back(elements);
+ output_batch_size_ = elements;
+ for (int i = 0; i < N; ++i)
+ {
+ reshape_[i].push_back(elements);
+ bcast_[i].push_back(1);
+ }
+ // grad_reduce_ is left as empty
+ return;
+ }
+
+ // Reverse all the shapes for convenience
+ // After the reverse, 0-th is the inner-most dimension.
+ Vec copy[N];
+ for (int i = 0; i < N; ++i)
+ {
+ copy[i] = x[i];
+ Reverse(&copy[i]);
+ }
+
+ // 1-extend and align all vectors.
+ for (int i = 0; i < N; ++i)
+ {
+ if (copy[i].size() < largest_rank)
+ {
+ copy[i].resize(largest_rank, 1);
+ }
+ }
+ // Going through each dimension starting from the inner-most
+ // dimension, compares dimension of x and y. They are compatible if
+ // they are equal or either is 1.
+
+ // indices of j-th component of each input.
+ bool prev_is_one[N];
+ bool current_is_one[N];
+ for (int i = 0; i < N; ++i)
+ {
+ prev_is_one[i] = false;
+ current_is_one[i] = false;
+ }
+ Vec output;
+ bool output_dim_set = false;
+ int output_dim = -1;
+ bool none_is_one = true;
+ bool set_one = false;
+ for (size_t j = 0; j < largest_rank; ++j)
+ {
+ output_dim = -1;
+ output_dim_set = false;
+ none_is_one = true;
+ // Find which indices are 1.
+ for (int i = 0; i < N; ++i)
+ {
+ // Keep track of which indices are 1.
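+ // An extent is broadcast-compatible when it is either 1 or equal to the
+ // shared output extent; any other mismatch invalidates the whole
+ // broadcast (valid_ is cleared below).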
+ if (copy[i][j] == 1) + { + current_is_one[i] = true; + none_is_one = false; + } + else + { + current_is_one[i] = false; + if (!output_dim_set || copy[i][j] == output_dim) + { + output_dim = copy[i][j]; + output_dim_set = true; + } + else + { + valid_ = false; + return; + } + } + } + output_.push_back(output_dim_set ? output_dim : 1); + output_batch_size_ *= output_.back(); + // All dimensions are 1. + if (!output_dim_set) + { + if (!fewer_dims_optimization) + { + for (int i = 0; i < N; ++i) + { + bcast_[i].push_back(1); + reshape_[i].push_back(1); + } + result_.push_back(1); + } + for (int i = 0; i < N; ++i) + { + grad_reduce_idx_[i].push_back(largest_rank - 1 - j); + } + // This will skip updating the previous state to the current one. We'll + // explain why this is safe below. + // Consider the previous state P, current state C and the next state N. + // In the case where N also is all ones (N == C), we'll do the same + // optimization here (push back one dimensions if we need to), which is + // safe and is expected. + // + // When N != C, we'll continue as usual. However, we might trigger the + // next block if N == P (because we didn't update the previous state). + // We trigger the next block if `fewer_dims_optimization` is true. + // This means that we did not modify and broadcast / reshapes in this + // block (we skipped updating, since the one dimensions can be ignored). + // In essence, we only need to check whether the previous non-one state is + // equal to the current non-one state. + + continue; + } + else if ((fewer_dims_optimization) && + std::equal(current_is_one, current_is_one + N, prev_is_one) && set_one) + { + // It is a run of the same broadcasting case as last time. + // We can reshape the input so that fewer dimensions + // are involved in the intermediate computation. + result_.back() *= output_dim; + for (int i = 0; i < N; ++i) + { + reshape_[i].back() *= copy[i][j]; + bcast_[i].back() *= current_is_one[i] ? output_dim : 1; + if (current_is_one[i] && !none_is_one) + { + grad_reduce_idx_[i].push_back(largest_rank - 1 - j); + } + } + } + else + { + result_.push_back(output_dim); + for (int i = 0; i < N; ++i) + { + reshape_[i].push_back(copy[i][j]); + bcast_[i].push_back(current_is_one[i] ? output_dim : 1); + if (current_is_one[i] && !none_is_one) + { + grad_reduce_idx_[i].push_back(largest_rank - 1 - j); + } + } + } + set_one = true; + for (int i = 0; i < N; ++i) + { + prev_is_one[i] = current_is_one[i]; + } + } + if (result_.empty()) + { + result_.push_back(1); + for (int i = 0; i < N; ++i) + { + reshape_[i].push_back(1); + bcast_[i].push_back(1); + } + } + // Do something about batches. + for (int i = 0; i < N; ++i) + { + Reverse(&reshape_[i]); + Reverse(&bcast_[i]); + Reverse(&grad_reduce_idx_[i]); + } + Reverse(&result_); + Reverse(&output_); + // Only compute batch indices when we need broadcasting, and we aren't doing + // needless work (when the output size is 0 or the + // return_flattened_batch_indices isn't enabled). + if (return_flattened_batch_indices && broadcasting_required_ && output_batch_size_ > 0) + { + for (int i = 0; i < N; ++i) + { + ComputeBatchIndices(output_batch_size_, reshape_[i], bcast_[i], &batch_indices_[i]); + } + } +} + +// BCast is a helper for broadcasting binary tensor operation. +// TensorFlow's broadcasting rule follows that of numpy (See +// http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html). +// +// The rule has the following properties: +// +// 1. 
suffix matching: the rule starts with the right-most +// dimension, and works towards the left-most dimension. Since +// TensorFlow is row-major, the right-most dimension (the last +// element in the shape of a tensor) is the inner-most, a.k.a. +// the fastest changing, dimension. +// +// 2. Two dimensions are compatible for broadcasting if both are the +// same or either is 1. +// +// BCast takes the shape of two tensors and computes a few vectors of +// int32 that are useful for the caller to reshape the tensors, apply +// the right broadcasts to them, compute the broadcasted operation, +// and possibly the gradients. In a nutshell, the caller is expected +// to compute the broadcasted operation as following: +// +// BCast b(x.shape(), y.shape()); +// output = x.reshape(b.x_reshape()).broadcast(b.x_bcast()) +// _op_ +// y.reshape(b.y_reshape()).broadcast(b.y_bcast()) +// +// For the gradient computation, +// grad_x = sum(grad * backprop_x(x, y), grad_x_reduce_idx) +// .reshape(x.shape()) +// grad_y = sum(grad * backprop_y(x, y), grad_y_reduce_idx) +// .reshape(y.shape()) +// backprop_x and backprop_y are functionals of the binary function "op", +// e.g., +// for +, backprop_x(x, y) = backprop_y(x, y) = 1; +// for *, backprop_x(x, y) = y, backprop_y(x, y) = x; +// for /, backprop_x(x, y) = 1/y, backprop_y(x, y) = -x/y^2; +// +// The multiplication in the grad * backprop_x itself is also +// broadcasting following the same rule. +class BCast : public BCastList<2> +{ +public: + // Constructs all helper shapes, following the aforementioned rules. + // + // If "fewer_dims_optimization" is set to true (the default), the + // implementation tries to reduce intermediate dimensions needed to be more + // efficient. This is transparent to the caller. + // + // If false, all intermediate shapes (except for grad_{x,y}_reduce_idx()) have + // the same number of dimensions as the larger of the two inputs. + typedef std::vector<int32_t> Vec; + + BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true, + const bool return_flattened_batch_indices = false) + : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) + { + } + + ~BCast() {} + + // If and only if IsValid(), the following fields can be used in + // implementing a broadcasted binary tensor operation according to + // the broadcasting rule. + const Vec &x_reshape() const { return reshape_[0]; } + const Vec &x_bcast() const { return bcast_[0]; } + const Vec &y_reshape() const { return reshape_[1]; } + const Vec &y_bcast() const { return bcast_[1]; } + const Vec &result_shape() const { return result_; } + const Vec &output_shape() const { return output_; } + const Vec &grad_x_reduce_idx() const { return grad_reduce_idx_[0]; } + const Vec &grad_y_reduce_idx() const { return grad_reduce_idx_[1]; } + + // Returns the mapping from the flattened output batch indices to x's + // flattened batch indices. The result is a vector of length + // output_batch_size(). To compute the i'th batch output, a binary matmul-like + // operation should use the `x_batch_indices()[i]`th batch index of `x`. + // Note: Returns an empty vector if broadcasting is not required. Callers + // should only use this when IsBroadcastingRequired() returns true. + const std::vector<int32_t> &x_batch_indices() const { return batch_indices_[0]; } + // Returns the mapping from the flattened output batch indices to y's + // flattened batch indices. Similar to x_batch_indices(). + // Note: Returns an empty vector if broadcasting is not required. 
Callers + // should only use this when IsBroadcastingRequired() returns true. + const std::vector<int32_t> &y_batch_indices() const { return batch_indices_[1]; } + + template <typename IndexType, int NDIMS> + static Eigen::array<IndexType, NDIMS> ToIndexArrayType(const BCast::Vec &vec) + { + assert(vec.size() == NDIMS); + Eigen::array<IndexType, NDIMS> ret; + for (int i = 0; i < NDIMS; ++i) + ret[i] = vec[i]; + return ret; + } + + template <int NDIMS> + static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(const BCast::Vec &vec) + { + return ToIndexArrayType<Eigen::DenseIndex, NDIMS>(vec); + } + + // Static helpers. + static Vec FromShape(const Shape &shape) + { + const int N = shape.DimensionsCount(); + BCastList::Vec ret(N); + for (int i = 0; i < N; ++i) + { + ret[i] = shape.Dims(i); + } + return ret; + } + + static Shape ToShape(const BCastList::Vec &vec) + { + const int N = vec.size(); + Shape shape(N); + + for (int i = 0; i < N; ++i) + { + shape.SetDim(i, vec[i]); + } + return shape; + } + +}; // BCast +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_HELPER_BCAST_H__ diff --git a/compute/cker/include/cker/operation/Helper/MatmulBCast.h b/compute/cker/include/cker/operation/Helper/MatmulBCast.h new file mode 100644 index 000000000..b80ccc0d0 --- /dev/null +++ b/compute/cker/include/cker/operation/Helper/MatmulBCast.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EINSUM_HELPER_MATMUL_BCAST_H__ +#define __NNFW_CKER_EINSUM_HELPER_MATMUL_BCAST_H__ + +#include <vector> +#include <memory> +#include <numeric> + +#include "BCast.h" +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +// Simple wrapper over BCast specialized for MatMul. +// Provides utilities for broadcasting across batch dimensions for binary +// MatMul-like operations. 
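+//
+// Example (illustrative): with shape_x = [2, 1, 5, 7] and shape_y = [3, 7, 9],
+// the batch dimensions [2, 1] and [3] broadcast to [2, 3], so x_batch_size()
+// is 2, y_batch_size() is 3 and output_batch_size() is 6.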
+
+// TODO: Use Shape directly instead of Vec
+class MatMulBCast
+{
+public:
+ MatMulBCast(Shape &shape_x, Shape &shape_y)
+ {
+ if (shape_x.DimensionsCount() < 2 || shape_y.DimensionsCount() < 2)
+ return;
+
+ std::vector<int32_t> x;
+ std::vector<int32_t> y;
+
+ x.resize(shape_x.DimensionsCount() - 2);
+ y.resize(shape_y.DimensionsCount() - 2);
+
+ for (size_t i = 0; i < x.size(); i++)
+ {
+ x[i] = shape_x.Dims(i);
+ }
+ for (size_t i = 0; i < y.size(); i++)
+ {
+ y[i] = shape_y.Dims(i);
+ }
+
+ _batch_bcast = std::make_unique<BCast>(std::move(x), std::move(y));
+ if (!_batch_bcast->IsValid())
+ return;
+
+ auto x_reshaped = _batch_bcast->x_reshape();
+ auto y_reshaped = _batch_bcast->y_reshape();
+ auto output_shape = _batch_bcast->output_shape();
+
+ _x_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1),
+ std::multiplies<int32_t>());
+ // Note: y's batch size must be computed from y's reshape vector, not x's.
+ _y_batch_size = std::accumulate(y_reshaped.cbegin(), y_reshaped.cend(), INT32_C(1),
+ std::multiplies<int32_t>());
+ _output_shape.ReplaceWith(output_shape.size(), output_shape.data());
+ _output_batch_size = _output_shape.FlatSize();
+ }
+
+ bool IsValid() const { return (_batch_bcast != nullptr) && _batch_bcast->IsValid(); }
+ int32_t x_batch_size() const { return _x_batch_size; }
+ int32_t y_batch_size() const { return _y_batch_size; }
+ int32_t output_batch_size() const { return _output_batch_size; }
+ const Shape &output_batch_shape() const { return _output_shape; }
+
+private:
+ std::unique_ptr<BCast> _batch_bcast;
+
+ int32_t _x_batch_size;
+ int32_t _y_batch_size;
+ Shape _output_shape;
+ int32_t _output_batch_size;
+};
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_EINSUM_HELPER_MATMUL_BCAST_H__
diff --git a/compute/cker/include/cker/operation/Helper/PhiloxRandom.h b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h
new file mode 100644
index 000000000..8e8879ce9
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
+#define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
+
+#include <stdlib.h>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+// Function qualifiers that need to work on both CPU and GPU.
+#if defined(__CUDACC__) || defined(__HIPCC__)
+// For nvcc.
+#define PHILOX_DEVICE_FUNC __host__ __device__
+#define PHILOX_INLINE __inline__
+#else
+// For non-nvcc.
+#define PHILOX_DEVICE_FUNC
+#define PHILOX_INLINE inline
+#endif
+#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
+
+#include <math.h>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace random
+{
+
+// A class that represents an inline array. It can be used on both CPU and GPU,
+// and also trivially copyable between CPU and GPU.
+// Arguments: +// T: the array element type; +// ElementCount: the fixed size of the array; +template <typename T, int ElementCount> class Array +{ +public: + static constexpr int kElementCount = ElementCount; + PHILOX_DEVICE_INLINE Array() + { + for (int i = 0; i < ElementCount; ++i) + { + data_[i] = T(0); + } + } + + PHILOX_DEVICE_INLINE const T &operator[](int index) const { return data_[index]; } + + PHILOX_DEVICE_INLINE T &operator[](int index) { return data_[index]; } + + size_t size() const { return ElementCount; } + +private: + T data_[ElementCount]; +}; + +// A class that encapsulates all the states for a random number generator using +// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits +// in the form of four uint32. +// There are multiple variants of this algorithm, we picked the 4x32_10 version +// that is most suited for our applications. +// Since this class is meant to be copied between CPU to GPU, it maintains a +// value semantics. +// +// For example: To use this class and populate an array of 1024 randoms on CPU +// with two threads, +// +// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) { +// assert(start % 4 == 0); +// assert(limit % 4 == 0); +// rnd.Skip(start / 4); +// for (int i = start; i < limit; i += 4) { +// auto sample = rnd(); +// ... copy sample[0..3] to output[i..i+3] +// } +// } +// +// PhiloxRandom rng(seed); +// PhiloxRandom rng_copy = rng; +// rng.Skip(1000/4); +// +// ... schedule Fill(rng_copy, output, 0, 512) in thread 1; +// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; +// ... wait for thread 1 & 2 to finish executing Fill(). +// +// NOTE: +// 1. PhiloxRandom is trivially copyable. +// 2. PhiloxRandom is compilable by gcc and nvcc. +class PhiloxRandom +{ +public: + using ResultType = Array<uint32_t, 4>; + using ResultElementType = uint32_t; + // The number of elements that will be returned. + static constexpr int kResultElementCount = 4; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 10; + // The type for the 64-bit key stored in the form of two 32-bit uint + // that are used in the diffusion process. + using Key = Array<uint32_t, 2>; + + PHILOX_DEVICE_INLINE + PhiloxRandom() {} + + PHILOX_DEVICE_INLINE + explicit PhiloxRandom(uint64_t seed) + { + key_[0] = static_cast<uint32_t>(seed); + key_[1] = static_cast<uint32_t>(seed >> 32); + } + + PHILOX_DEVICE_INLINE + explicit PhiloxRandom(uint64_t seed_lo, uint64_t seed_hi) + { + key_[0] = static_cast<uint32_t>(seed_lo); + key_[1] = static_cast<uint32_t>(seed_lo >> 32); + counter_[2] = static_cast<uint32_t>(seed_hi); + counter_[3] = static_cast<uint32_t>(seed_hi >> 32); + } + + PHILOX_DEVICE_INLINE + PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {} + + PHILOX_DEVICE_INLINE + ResultType const &counter() const { return counter_; } + + PHILOX_DEVICE_INLINE + Key const &key() const { return key_; } + + // Skip the specified number of samples of 128-bits in the current stream. + PHILOX_DEVICE_INLINE + void Skip(uint64_t count) + { + const uint32_t count_lo = static_cast<uint32_t>(count); + uint32_t count_hi = static_cast<uint32_t>(count >> 32); + + counter_[0] += count_lo; + if (counter_[0] < count_lo) + { + ++count_hi; + } + + counter_[1] += count_hi; + if (counter_[1] < count_hi) + { + if (++counter_[2] == 0) + { + ++counter_[3]; + } + } + } + + // Returns a group of four random numbers using the underlying Philox + // algorithm. 
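+ // Each call encrypts the current 128-bit counter with ten rounds keyed by
+ // key_ and then advances the counter by one (see SkipOne() below), so
+ // successive calls yield independent 128-bit samples.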
+ PHILOX_DEVICE_INLINE ResultType operator()() + { + ResultType counter = counter_; + Key key = key_; + + // Run the single rounds for ten times. Manually unrolling the loop + // for better performance. + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + + SkipOne(); + + return counter; + } + +private: + // We use the same constants as recommended by the original paper. + static constexpr uint32_t kPhiloxW32A = 0x9E3779B9; + static constexpr uint32_t kPhiloxW32B = 0xBB67AE85; + static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53; + static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57; + + // Helper function to skip the next sample of 128-bits in the current stream. + PHILOX_DEVICE_INLINE void SkipOne() + { + if (++counter_[0] == 0) + { + if (++counter_[1] == 0) + { + if (++counter_[2] == 0) + { + ++counter_[3]; + } + } + } + } + + // Helper function to return the lower and higher 32-bits from two 32-bit + // integer multiplications. + PHILOX_DEVICE_INLINE + static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high) + { +#ifndef __CUDA_ARCH__ + const uint64_t product = static_cast<uint64_t>(a) * b; + *result_low = static_cast<uint32_t>(product); + *result_high = static_cast<uint32_t>(product >> 32); +#else + *result_low = a * b; + *result_high = __umulhi(a, b); +#endif + } + + // Helper function for a single round of the underlying Philox algorithm. + PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(const ResultType &counter, + const Key &key) + { + uint32_t lo0; + uint32_t hi0; + MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); + + uint32_t lo1; + uint32_t hi1; + MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); + + ResultType result; + result[0] = hi1 ^ counter[1] ^ key[0]; + result[1] = lo1; + result[2] = hi0 ^ counter[3] ^ key[1]; + result[3] = lo0; + return result; + } + + PHILOX_DEVICE_INLINE void RaiseKey(Key *key) + { + (*key)[0] += kPhiloxW32A; + (*key)[1] += kPhiloxW32B; + } + +private: + ResultType counter_; + Key key_; +}; + +} // namespace random +} // namespace cker +} // namespace nnfw +#endif // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h new file mode 100644 index 000000000..baeafd7c9 --- /dev/null +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2015 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ +#define __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ + +#include <string.h> + +#include <cmath> + +#include <algorithm> +#include <type_traits> + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/eigen/EigenSupport.h" +#include "cker/operation/Helper/PhiloxRandom.h" + +namespace nnfw +{ +namespace cker +{ +namespace random +{ + +// Helper function to convert a 16-bit integer to a half between [0..1). +PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x); +// Helper function to convert a 16-bit integer to a bfloat16 between [0..1). +// PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x); +// Helper function to convert a 32-bit integer to a float between [0..1). +PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x); +// Helper function to convert two 32-bit integers to a double between [0..1). +PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1); + +// Computes a + b. Requires that the result is representable in the destination +// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b +// need *not* be representable in that type. (The condition on b excludes the +// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot +// compute.) +template <typename Int> +PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) +{ + // Implementation note: both b_div_2 and b - b_div_2 are positive and + // representable as Int. + auto b_div_2 = b >> 1; + return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2); +} + +// A class that generates uniform distribution random numbers from the +// underlying random integer generator. +// Arguments: +// Generator: a generator type that returns a number of uint32 upon each +// invocation. It needs to define kResultElementCount for the +// sample count for each invocation, and ResultType for the +// actual returned sample type. +// RealType: the data type of the real numbers that will be returned by the +// distribution. This could be either float or double for now. +// This class is meant to be implemented through specialization. The default +// is not defined by design. +template <class Generator, typename RealType> class UniformDistribution; + +template <class Generator> class UniformDistribution<Generator, Eigen::half> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. 
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<Eigen::half, kResultElementCount> ResultType;
+  typedef Eigen::half ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; ++i)
+    {
+      result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
+    }
+    return result;
+  }
+};
+
+template <class Generator> class UniformDistribution<Generator, float>
+{
+public:
+  // The number of elements that will be returned.
+  static constexpr int kResultElementCount = Generator::kResultElementCount;
+  // Cost of generation of a single element (in cycles).
+  static constexpr int kElementCost = 3;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<float, kResultElementCount> ResultType;
+  typedef float ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; ++i)
+    {
+      result[i] = Uint32ToFloat(sample[i]);
+    }
+    return result;
+  }
+};
+
+template <class Generator> class UniformDistribution<Generator, double>
+{
+public:
+  // The number of elements that will be returned.
+  static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+  // Cost of generation of a single element (in cycles).
+  static constexpr int kElementCost = 3;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<double, kResultElementCount> ResultType;
+  typedef double ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; ++i)
+    {
+      result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
+    }
+    return result;
+  }
+};
+
+template <class Generator> class UniformDistribution<Generator, int32_t>
+{
+public:
+  // The number of elements that will be returned.
+  static constexpr int kResultElementCount = Generator::kResultElementCount;
+  // Cost of generation of a single element (in cycles).
+  static constexpr int kElementCost = 3;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<int32_t, kResultElementCount> ResultType;
+  typedef int32_t ResultElementType;
+
+  // Must have lo < hi
+  UniformDistribution(int32_t lo, int32_t hi)
+      : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo))
+  {
+  }
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; ++i)
+    {
+      result[i] = SignedAdd(lo_, sample[i] % range_);
+    }
+    return result;
+  }
+
+private:
+  // Note that lo_ is intentionally signed while range_ is intentionally
+  // unsigned. This is because hi - lo can overflow signed integers if
+  // lo < 0 < hi, but always fits in unsigned.
+  int32_t lo_;
+  uint32_t range_;
+};
+
+template <class Generator> class UniformDistribution<Generator, int64_t>
+{
+public:
+  // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<int64_t, kResultElementCount> ResultType; + typedef int64_t ResultElementType; + + // Must have lo < hi + UniformDistribution(int64_t lo, int64_t hi) + : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) + { + } + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + auto bits = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32; + result[i] = SignedAdd(lo_, bits % range_); + } + return result; + } + +private: + // Note that lo_ is intentionally signed while range_ is intentionally + // unsigned. This is because hi - lo can overflow signed integers if + // lo < 0 < hi, but always fits in unsigned. + int64_t lo_; + uint64_t range_; +}; + +// Similar to `UniformDistribution`, except that instead of generating numbers +// in the range [low, high), it generates numbers covering the whole range of +// the integer type. +template <typename Generator, typename IntType> class UniformFullIntDistribution; + +template <typename Generator, typename IntType> class UniformFullIntDistribution32 +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<IntType, kResultElementCount> ResultType; + typedef IntType ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + result[i] = sample[i]; + } + return result; + } +}; + +template <typename Generator, typename IntType> class UniformFullIntDistribution64 +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount / 2; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. 
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<IntType, kResultElementCount> ResultType;
+  typedef IntType ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; ++i)
+    {
+      result[i] = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
+    }
+    return result;
+  }
+};
+
+template <typename Generator>
+class UniformFullIntDistribution<Generator, int32_t>
+    : public UniformFullIntDistribution32<Generator, int32_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, uint32_t>
+    : public UniformFullIntDistribution32<Generator, uint32_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, int64_t>
+    : public UniformFullIntDistribution64<Generator, int64_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, uint64_t>
+    : public UniformFullIntDistribution64<Generator, uint64_t>
+{
+};
+
+// A class that adapts a generator that natively returns multiple samples per
+// invocation into one that returns a single sample at a time.
+template <class Generator> class SingleSampleAdapter
+{
+public:
+  // The number of elements that will be returned.
+  static constexpr int kResultElementCount = 1;
+  // The number of elements that will be returned by the underlying generator.
+  static constexpr int kNativeElementCount = Generator::kResultElementCount;
+  typedef typename Generator::ResultElementType ResultType;
+  typedef typename Generator::ResultElementType ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  explicit SingleSampleAdapter(Generator *gen)
+      : generator_(gen), used_result_index_(Generator::kResultElementCount)
+  {
+  }
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()()
+  {
+    if (used_result_index_ == Generator::kResultElementCount)
+    {
+      unused_results_ = (*generator_)();
+      used_result_index_ = 0;
+    }
+
+    return unused_results_[used_result_index_++];
+  }
+
+  PHILOX_DEVICE_INLINE
+  void Skip(uint64_t num_skips)
+  {
+    if (!num_skips)
+    {
+      return;
+    }
+    int num_unused_results = kNativeElementCount - used_result_index_;
+    if (num_skips <= static_cast<uint64_t>(num_unused_results))
+    {
+      used_result_index_ += static_cast<int>(num_skips);
+      return;
+    }
+    num_skips -= num_unused_results;
+    used_result_index_ = kNativeElementCount;
+    SkipFromGenerator(num_skips / kNativeElementCount);
+    num_skips = num_skips % kNativeElementCount;
+    if (num_skips)
+    {
+      unused_results_ = (*generator_)();
+      used_result_index_ = static_cast<int>(num_skips);
+    }
+  }
+
+private:
+  // This implementation iteratively skips over `num_skips` samples
+  // from `generator_`. There is an O(1) implementation for PhiloxRandom
+  // in random_distributions.cc.
+  PHILOX_DEVICE_INLINE
+  void SkipFromGenerator(uint64_t num_skips)
+  {
+    while (num_skips--)
+    {
+      (*generator_)();
+    }
+  }
+
+  Generator *generator_;
+  typename Generator::ResultType unused_results_;
+  int used_result_index_;
+};
+
+// A class that generates unit normal distribution random numbers from the
+// underlying random integer generator.
+// Arguments:
+//   Generator: a generator type that returns a number of uint32 upon each
+//              invocation. It needs to define kResultElementCount for the
+//              sample count for each invocation, and ResultType for the actual
+//              returned sample type.
+//   RealType: the data type of the real numbers that will be returned by the
+//             distribution. This could be either float or double for now.
+// This class is meant to be implemented through specialization. The default
+// is not defined by design.
+template <class Generator, typename RealType> class NormalDistribution;
+
+PHILOX_DEVICE_INLINE
+void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1);
+
+PHILOX_DEVICE_INLINE
+void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1);
+
+// Exactly like the float version, except that we convert to half afterwards;
+// since we don't have half-precision sin/cos even on GPUs, there's nothing to
+// gain from working in half internally.
+template <class Generator> class NormalDistribution<Generator, Eigen::half>
+{
+public:
+  // The number of elements that will be returned.
+  static constexpr int kResultElementCount = Generator::kResultElementCount;
+  // Cost of generation of a single element (in cycles).
+  static constexpr int kElementCost = 70;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<Eigen::half, kResultElementCount> ResultType;
+  typedef Eigen::half ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; i += 2)
+    {
+      float f[2];
+      BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
+      result[i] = Eigen::half(f[0]);
+      result[i + 1] = Eigen::half(f[1]);
+    }
+    return result;
+  }
+};
+
+template <class Generator> class NormalDistribution<Generator, float>
+{
+public:
+  // The number of elements that will be returned.
+  static constexpr int kResultElementCount = Generator::kResultElementCount;
+  // Cost of generation of a single element (in cycles).
+  static constexpr int kElementCost = 70;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<float, kResultElementCount> ResultType;
+  typedef float ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; i += 2)
+    {
+      BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
+    }
+    return result;
+  }
+};
+
+template <class Generator> class NormalDistribution<Generator, double>
+{
+public:
+  // The number of elements that will be returned.
+  static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+  // Cost of generation of a single element (in cycles).
+  static constexpr int kElementCost = 70;
+  // Indicate that this distribution may take variable number of samples
+  // during the runtime.
+  static constexpr bool kVariableSamplesPerOutput = false;
+  typedef Array<double, kResultElementCount> ResultType;
+  typedef double ResultElementType;
+
+  PHILOX_DEVICE_INLINE
+  ResultType operator()(Generator *gen)
+  {
+    typename Generator::ResultType sample = (*gen)();
+    ResultType result;
+    for (int i = 0; i < kResultElementCount; i += 2)
+    {
+      const int i2 = 2 * i;
+      BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
+                      &result[i + 1]);
+    }
+    return result;
+  }
+};
+
+// A class that returns a standard normal distribution truncated to
+// [-kTruncateValue, kTruncateValue].
+// Arguments:
+//   Generator: a generator type that returns a number of uint32 upon each
+//              invocation.
It needs to define kResultElementCount for the +// sample count for each invocation, and ResultType for actual +// returned sample type. +// RealType: the data type of the real numbers that will be returned by the +// distribution. This could be either float or double for now. +// This class is meant to be implemented through specialization. The default +// is not defined by design. +template <class SingleSampleGenerator, typename RealType> class TruncatedNormalDistribution; + +// Exactly like the float version, except that we convert to half afterwards; +// since we don't have half-precision sin/cos even on GPUs, there's nothing to +// gain from working in half internally. +template <class SingleSampleGenerator> +class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 90; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = true; + // The threshold where the normal distribution is truncated. + const float kTruncateValue = 2.0f; + + typedef Array<Eigen::half, kResultElementCount> ResultType; + typedef Eigen::half ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(SingleSampleGenerator *gen) + { + ResultType results; + int index = 0; + while (true) + { + // Repeatedly take samples from the normal distribution, until we have + // the desired number of elements that fall within the pre-defined cutoff + // threshold. + const uint32_t x0 = (*gen)(); + const uint32_t x1 = (*gen)(); + float f[2]; + BoxMullerFloat(x0, x1, &f[0], &f[1]); + + if (Eigen::numext::abs(f[0]) < kTruncateValue) + { + results[index++] = Eigen::half(f[0]); + if (index >= kResultElementCount) + { + return results; + } + } + if (Eigen::numext::abs(f[1]) < kTruncateValue) + { + results[index++] = Eigen::half(f[1]); + if (index >= kResultElementCount) + { + return results; + } + } + } + } +}; + +// Partial specialization for float. +template <class SingleSampleGenerator> +class TruncatedNormalDistribution<SingleSampleGenerator, float> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 90; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = true; + // The threshold where the normal distribution is truncated. + const float kTruncateValue = 2.0f; + + typedef Array<float, kResultElementCount> ResultType; + typedef float ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(SingleSampleGenerator *gen) + { + ResultType results; + int index = 0; + while (true) + { + // Repeatedly take samples from the normal distribution, until we have + // the desired number of elements that fall within the pre-defined cutoff + // threshold. 
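+      // With the cutoff at 2 standard deviations, roughly 95.4% of the
+      // candidate values are accepted, so on average each kept element costs
+      // only slightly more than one Box-Muller draw.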
+ const uint32_t x0 = (*gen)(); + const uint32_t x1 = (*gen)(); + float f[2]; + BoxMullerFloat(x0, x1, &f[0], &f[1]); + + if (Eigen::numext::abs(f[0]) < kTruncateValue) + { + results[index++] = f[0]; + if (index >= kResultElementCount) + { + return results; + } + } + if (Eigen::numext::abs(f[1]) < kTruncateValue) + { + results[index++] = f[1]; + if (index >= kResultElementCount) + { + return results; + } + } + } + } +}; + +// Partial specialization for double. +template <class SingleSampleGenerator> +class TruncatedNormalDistribution<SingleSampleGenerator, double> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) + ? SingleSampleGenerator::kNativeElementCount / 2 + : 1; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 90; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = true; + typedef Array<double, kResultElementCount> ResultType; + typedef double ResultElementType; + const double kTruncateValue = 2.0; + + PHILOX_DEVICE_INLINE + ResultType operator()(SingleSampleGenerator *gen) + { + ResultType results; + int index = 0; + while (1) + { + const uint32_t x0 = (*gen)(); + const uint32_t x1 = (*gen)(); + const uint32_t x2 = (*gen)(); + const uint32_t x3 = (*gen)(); + double d[2]; + BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]); + + if (Eigen::numext::abs(d[0]) < kTruncateValue) + { + results[index++] = d[0]; + if (index >= kResultElementCount) + { + return results; + } + } + if (Eigen::numext::abs(d[1]) < kTruncateValue) + { + results[index++] = d[1]; + if (index >= kResultElementCount) + { + return results; + } + } + } + } +}; + +// Helper function to convert two 32-bit uniform integers to two floats +// under the unit normal distribution. +PHILOX_DEVICE_INLINE +void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1) +{ + // This function implements the Box-Muller transform: + // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form + // Do not send a really small number to log(). + // We cannot mark "epsilon" as "static const" because NVCC would complain + const float epsilon = 1.0e-7f; + float u1 = Uint32ToFloat(x0); + if (u1 < epsilon) + { + u1 = epsilon; + } + const float v1 = 2.0f * M_PI * Uint32ToFloat(x1); + const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1)); +#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) + *f0 = Eigen::numext::sin(v1); + *f1 = Eigen::numext::cos(v1); +#else + sincosf(v1, f0, f1); +#endif + *f0 *= u2; + *f1 *= u2; +} + +// Helper function to convert four 32-bit uniform integers to two doubles +// under the unit normal distribution. +PHILOX_DEVICE_INLINE +void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1) +{ + // This function implements the Box-Muller transform: + // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form + // Do not send a really small number to log(). 
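+  // (log(u1) diverges as u1 approaches zero, which would propagate an
+  // infinity through sqrt() and yield non-finite outputs.)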
+  // We cannot mark "epsilon" as "static const" because NVCC would complain
+  const double epsilon = 1.0e-7;
+  double u1 = Uint64ToDouble(x0, x1);
+  if (u1 < epsilon)
+  {
+    u1 = epsilon;
+  }
+  const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
+  const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
+#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
+  *d0 = Eigen::numext::sin(v1);
+  *d1 = Eigen::numext::cos(v1);
+#else
+  sincos(v1, d0, d1);
+#endif
+  *d0 *= u2;
+  *d1 *= u2;
+}
+
+// Helper function to convert a 16-bit integer to a half between [0..1).
+PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x)
+{
+  // IEEE754 halves are formatted as follows (MSB first):
+  //   sign(1) exponent(5) mantissa(10)
+  // Conceptually construct the following:
+  //   sign == 0
+  //   exponent == 15 -- an excess 15 representation of a zero exponent
+  //   mantissa == 10 random bits
+  const uint16_t man = x & 0x3ffu; // 10 bit mantissa
+  const uint16_t exp = static_cast<uint16_t>(15);
+  const uint16_t val = (exp << 10) | man;
+
+  Eigen::half result;
+  result.x = val;
+  return result - Eigen::half(1.0);
+}
+
+// Helper function to convert a 32-bit integer to a float between [0..1).
+PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x)
+{
+  // IEEE754 floats are formatted as follows (MSB first):
+  //   sign(1) exponent(8) mantissa(23)
+  // Conceptually construct the following:
+  //   sign == 0
+  //   exponent == 127 -- an excess 127 representation of a zero exponent
+  //   mantissa == 23 random bits
+  const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
+  const uint32_t exp = static_cast<uint32_t>(127);
+  const uint32_t val = (exp << 23) | man;
+
+  // Assumes that endianness is the same for float and uint32.
+  float result;
+  memcpy(&result, &val, sizeof(val));
+  return result - 1.0f;
+}
+
+// Helper function to convert two 32-bit integers to a double between [0..1).
+PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1)
+{
+  // IEEE754 doubles are formatted as follows (MSB first):
+  //   sign(1) exponent(11) mantissa(52)
+  // Conceptually construct the following:
+  //   sign == 0
+  //   exponent == 1023 -- an excess 1023 representation of a zero exponent
+  //   mantissa == 52 random bits
+  const uint32_t mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
+  const uint32_t mlo = x1;            // lower 32 bits of mantissa
+  const uint64_t man = (static_cast<uint64_t>(mhi) << 32) | mlo; // mantissa
+  const uint64_t exp = static_cast<uint64_t>(1023);
+  const uint64_t val = (exp << 52) | man;
+  // Assumes that endianness is the same for double and uint64.
+  double result;
+  memcpy(&result, &val, sizeof(val));
+  return result - 1.0;
+}
+
+} // namespace random
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h
new file mode 100644
index 000000000..7dc51fe94
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomOp.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

+#ifndef __NNFW_CKER_HELPER_RANDOM_OP_H__
+#define __NNFW_CKER_HELPER_RANDOM_OP_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/Helper/RandomDistributions.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace functor
+{
+
+template <typename Device, class Distribution> struct FillPhiloxRandom;
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+// Declares the partially CPU-specialized functor struct.
+//
+// NOTE: Due to inlining done by the compiler, you may need to add
+// explicit instantiation of the functor in random_op.cc. See example
+// functor::FillPhiloxRandom<CPUDevice, random::UniformDistribution>.
+template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution>
+{
+  void operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data,
+                  int64_t size, Distribution dist);
+};
+
+} // namespace functor
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_HELPER_RANDOM_OP_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
new file mode 100644
index 000000000..85d267723
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
+#define __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+
+#include "cker/operation/Helper/PhiloxRandom.h"
+#include "cker/operation/Helper/RandomOp.h"
+#include "cker/operation/Helper/RandomDistributions.h"
+
+#if EIGEN_COMP_GNUC && __cplusplus > 199711L
+#define DISABLE_FLOAT_EQUALITY_WARNING \
+  _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
+#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop")
+#else
+#define DISABLE_FLOAT_EQUALITY_WARNING
+#define ENABLE_FLOAT_EQUALITY_WARNING
+#endif
+
+namespace nnfw
+{
+namespace cker
+{
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+namespace functor
+{
+using random::PhiloxRandom;
+using random::SingleSampleAdapter;
+
+// The default implementation of the functor, which should never be invoked.
+// But we still need to provide an implementation for now so that the linker
+// works, since we do not support all the distributions yet.
+template <typename Device, class Distribution> struct FillPhiloxRandom
+{
+  typedef typename Distribution::ResultElementType T;
+  void operator()() {}
+};
+
+// A class to fill a specified range of random groups
+template <class Distribution, bool VariableSamplesPerOutput> struct FillPhiloxRandomTask;
+
+// Specialization for distribution that takes a fixed number of samples for
+// each output.
+template <class Distribution> struct FillPhiloxRandomTask<Distribution, false>
+{
+  typedef typename Distribution::ResultElementType T;
+  static void Run(random::PhiloxRandom gen, T *data, int64_t size, Distribution dist)
+  {
+    const int kGroupSize = Distribution::kResultElementCount;
+    int64_t offset = 0;
+
+    // First fill all the full-size groups
+    int64_t limit_group_full = size / kGroupSize;
+    for (int64_t index = 0; index < limit_group_full; ++index)
+    {
+      auto samples = dist(&gen);
+      std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+      offset += kGroupSize;
+    }
+
+    int64_t remaining_size = size - limit_group_full * kGroupSize;
+
+    // If there are any remaining elements that need to be filled, process them
+    if (remaining_size > 0)
+    {
+      auto samples = dist(&gen);
+      std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+    }
+  }
+};
+
+// Specialization for distribution that takes a variable number of samples for
+// each output. This will be slower due to the generality.
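+// Each output group is given its own reserved region of the underlying Philox
+// stream (kReservedSamplesPerOutput samples per output element), so the result
+// is deterministic regardless of how the work is partitioned across threads.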
+template <class Distribution> struct FillPhiloxRandomTask<Distribution, true>
+{
+  typedef typename Distribution::ResultElementType T;
+  static constexpr int64_t kReservedSamplesPerOutput = 256;
+
+  static void Run(random::PhiloxRandom base_gen, T *data, int64_t size, Distribution dist)
+  {
+    const int kGroupSize = Distribution::kResultElementCount;
+    static const int kGeneratorSkipPerOutputGroup =
+        kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount;
+
+    int64_t offset = 0;
+
+    // First fill all the full-size groups
+    int64_t limit_group_full = size / kGroupSize;
+    int64_t group_index;
+    for (group_index = 0; group_index < limit_group_full; ++group_index)
+    {
+      // Reset the generator to the beginning of the output group region.
+      // This is necessary if we want the results to be independent of the
+      // order of work.
+      PhiloxRandom gen = base_gen;
+      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+      auto samples = dist(&single_samples);
+      std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+      offset += kGroupSize;
+    }
+
+    int64_t remaining_size = size - limit_group_full * kGroupSize;
+    // If there are any remaining elements that need to be filled, process them
+    if (remaining_size > 0)
+    {
+      PhiloxRandom gen = base_gen;
+      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+      auto samples = dist(&single_samples);
+      std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+    }
+  }
+};
+
+// Partial specialization for CPU to fill the entire region with random values.
+template <class Distribution>
+void FillPhiloxRandom<CPUDevice, Distribution>::
+operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data, int64_t size,
+           Distribution dist)
+{
+  FillPhiloxRandomTask<Distribution, Distribution::kVariableSamplesPerOutput>::Run(gen, data, size,
+                                                                                   dist);
+}
+
+} // namespace functor
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h
new file mode 100644
index 000000000..e6ac008a5
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/Tensor.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_TENSOR_H__
+#define __NNFW_CKER_HELPER_TENSOR_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/EigenSupport.h"
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> struct TTypes
+{
+  // Rank-<NDIMS> tensor of scalar type T.
+ typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> + Tensor; + typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, + Eigen::Aligned> + ConstTensor; + + // Unaligned Rank-<NDIMS> tensor of scalar type T. + typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>> UnalignedTensor; + typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>> + UnalignedConstTensor; + + typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned> + Tensor32Bit; + + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, + Eigen::Aligned> + Scalar; + typedef Eigen::TensorMap< + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstScalar; + + // Unaligned Scalar tensor of scalar type T. + typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> + UnalignedScalar; + typedef Eigen::TensorMap< + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> + UnalignedConstScalar; + + // Rank-1 tensor (vector) of scalar type T. + typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstFlat; + typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstVec; + + // Unaligned Rank-1 tensor (vector) of scalar type T. + typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedFlat; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> + UnalignedConstFlat; + typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedVec; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> UnalignedConstVec; + + // Rank-2 tensor (matrix) of scalar type T. + typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstMatrix; + + // Unaligned Rank-2 tensor (matrix) of scalar type T. + typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>> UnalignedMatrix; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>> + UnalignedConstMatrix; +}; + +typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32; + +template <typename T> struct InputTensor +{ + Shape shape; + const T *buffer; +}; + +struct Tensor +{ + Shape shape; + void *buffer; + +public: + bool copyFrom(const Tensor &other, const Shape &new_shape) + { + if (other.shape.FlatSize() != new_shape.FlatSize()) + return false; + + this->shape.ReplaceWith(new_shape.DimensionsCount(), new_shape.DimsData()); + this->buffer = other.buffer; + + return true; + } + + template <typename T> T *base() const + { + return buffer == nullptr ? 
nullptr : reinterpret_cast<T *>(buffer); + } + + template <typename T, size_t NDIMS> + typename TTypes<T, NDIMS>::Tensor shaped(const std::vector<int32_t> &new_sizes) + { + Eigen::array<Eigen::DenseIndex, NDIMS> dims; + for (size_t d = 0; d < NDIMS; d++) + { + dims[d] = new_sizes[d]; + } + return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims); + } + + template <typename T> typename TTypes<T>::Flat flat() { return shaped<T, 1>({shape.FlatSize()}); } + + template <typename T, size_t NDIMS> + typename TTypes<T, NDIMS>::ConstTensor shaped(const std::vector<int32_t> new_sizes) const + { + Eigen::array<Eigen::DenseIndex, NDIMS> dims; + for (size_t d = 0; d < NDIMS; d++) + { + dims[d] = new_sizes[d]; + } + return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims); + } + + // Create Eigen Tensor with current shape + template <typename T, size_t NDIMS> typename TTypes<T, NDIMS>::Tensor shaped() const + { + Eigen::array<Eigen::DenseIndex, NDIMS> dims; + for (size_t d = 0; d < NDIMS; d++) + { + dims[d] = shape.Dims(d); + } + return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims); + } + + template <typename T> typename TTypes<T>::ConstFlat flat() const + { + return shaped<T, 1>({shape.FlatSize()}); + } + + template <typename T> typename TTypes<T>::ConstScalar scalar() const + { + return typename TTypes<T>::ConstScalar(base<T>()); + } +}; // Tensor + +template <typename DSizes> Eigen::DSizes<Index32, DSizes::count> To32BitDims(const DSizes &in) +{ + Eigen::DSizes<Index32, DSizes::count> out; + for (int i = 0; i < DSizes::count; ++i) + { + out[i] = in[i]; + } + return out; +} + +template <typename TensorType> +typename TTypes<typename TensorType::Scalar, TensorType::NumIndices>::Tensor32Bit +To32Bit(TensorType in) +{ + typedef typename TTypes<typename TensorType::Scalar, TensorType::NumIndices>::Tensor32Bit RetType; + return RetType(in.data(), To32BitDims(in.dimensions())); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_HELPER_TENSOR_H__ diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h new file mode 100644 index 000000000..6445e8a2b --- /dev/null +++ b/compute/cker/include/cker/operation/InstanceNorm.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_INSTANCE_NORM_H__ +#define __NNFW_CKER_INSTANCE_NORM_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void InstanceNorm(const InstanceNormParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &gamma_shape, const float *gamma_data, + const Shape &beta_shape, const float *beta_data, const Shape &output_shape, + float *output_data) +{ + const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1); + const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2); + const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3); + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + + UNUSED_RELEASE(gamma_shape); + UNUSED_RELEASE(beta_shape); + assert(output_activation_min <= output_activation_max); + + for (int32_t batch = 0; batch < batches; batch++) + { + for (int32_t channel = 0; channel < channels; channel++) + { + double sum = 0.0f; + double square_sum = 0.0f; + int32_t size = heights * widths; + + for (int32_t height = 0; height < heights; height++) + { + for (int32_t width = 0; width < widths; width++) + { + double input_val = input_data[Offset(input_shape, batch, height, width, channel)]; + sum += input_val; + square_sum += (input_val * input_val); + } + } + + double mean = sum / size; + double var = square_sum / size - mean * mean; + + double gamma = gamma_data[channel]; + double beta = beta_data[channel]; + + double a = gamma / (std::sqrt(var + params.epsilon)); + double b = -mean * a + beta; + + for (int32_t height = 0; height < heights; height++) + { + for (int32_t width = 0; width < widths; width++) + { + double input_value = input_data[Offset(output_shape, batch, height, width, channel)]; + double output_value = input_value * a + b; + output_data[Offset(output_shape, batch, height, width, channel)] = + ActivationFunctionWithMinMax((float)output_value, output_activation_min, + output_activation_max); + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_INSTANCE_NORM_H__ diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h new file mode 100644 index 000000000..a0075c3d0 --- /dev/null +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_L2NORMALIZE_H__
+#define __NNFW_CKER_L2NORMALIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void L2NormalizeFloat32(const Shape &input_shape, const float *input_data,
+                               const Shape &output_shape, float *output_data)
+{
+  const float epsilon = 1e-6f;
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  for (int i = 0; i < outer_size; ++i)
+  {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c)
+    {
+      const float val = input_data[c];
+      squared_l2_norm += val * val;
+    }
+    float l2_norm = std::sqrt(squared_l2_norm);
+    l2_norm = std::max(l2_norm, epsilon);
+    for (int c = 0; c < depth; ++c)
+    {
+      *output_data = *input_data / l2_norm;
+      ++output_data;
+      ++input_data;
+    }
+  }
+}
+
+inline void L2NormalizeQuant8(L2NormParams &params, const Shape &input_shape,
+                              const uint8_t *input_data, const Shape &output_shape,
+                              uint8_t *output_data)
+{
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int32_t input_zero_point = params.input_zero_point;
+
+  for (int i = 0; i < outer_size; ++i)
+  {
+    int32_t square_l2_norm = 0;
+    for (int c = 0; c < depth; c++)
+    {
+      // Note that input_data advances by depth in the second pass below.
+      int32_t diff = input_data[c] - input_zero_point;
+      square_l2_norm += diff * diff;
+    }
+    int32_t inv_l2norm_multiplier;
+    int inv_l2norm_shift;
+    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift);
+    for (int c = 0; c < depth; c++)
+    {
+      int32_t diff = *input_data - input_zero_point;
+      int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+      int32_t unclamped_output_val = 128 + rescaled_diff;
+      int32_t output_val = std::min(static_cast<int32_t>(255),
+                                    std::max(static_cast<int32_t>(0), unclamped_output_val));
+      *output_data = static_cast<uint8_t>(output_val);
+      ++input_data;
+      ++output_data;
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_L2NORMALIZE_H__
diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h
new file mode 100644
index 000000000..27beaaead
--- /dev/null
+++ b/compute/cker/include/cker/operation/LSTM.h
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ +#define __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ + +#include "cker/TensorUtils.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +// LINT.IfChange +// Calculates a single LSTM gate. +// +// Implements the following formula: (* is matrix multiply) +// gate = activate(W_input * input + W_aux * aux_input + +// W_peephole * cell + W_recurrent * prev_output + bias) +// with layer norm: +// gate = activate(W_norm * normalize(...) + bias) // not adding bias inside +// +// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) +// +// Parameters: +// Input vectors (to LSTM): | Size: | Optional? +// input | n_input | +// aux_input | n_aux_input | y (bidir LSTM) +// Input vectors (persistent states): +// output_state | n_output | +// cell_state | n_cell | +// 'Constant' inputs: +// input_to_gate_weights | n_cell * n_input | +// aux_input_to_gate_weights | n_cell * n_aux_input | y (bidir LSTM) +// recurrent_to_gate_weights | n_cell * n_output | +// cell_to_gate_weights | n_cell | y (peephole) +// gate_bias | n_cell | +// layer_norm_coefficients | n_cell | y (layer norm) +// Output vector: +// gate | n_cell | +// Scalar parameters: +// n_batch - batch size / number of vectors +// n_input, n_aux_input, n_output, n_cell - size of vectors. +// activation - activation to use. +// is_input_all_zeros, is_aux_input_all_zeros - if input vectors are all zero. +// use_layer_norm - if doing layer norm LSTM. +inline void CalculateLstmGateFloat(const float *input, const float *input_to_gate_weights, + const float *aux_input, const float *aux_input_to_gate_weights, + const float *output_state, + const float *recurrent_to_gate_weights, const float *cell_state, + const float *cell_to_gate_weights, + const float *layer_norm_coefficients, const float *gate_bias, + const int n_batch, const int n_input, const int n_aux_input, + const int n_output, const int n_cell, + const FusedActivationFunctionType activation, float *gate, + const bool is_input_all_zeros, const bool is_aux_input_all_zeros) +{ + const bool use_peephole = (cell_to_gate_weights != nullptr); + const bool use_layer_norm = (layer_norm_coefficients != nullptr); + + // Initialize scratch buffers with bias for regular lstm or initialize with + // zero for layer norm lstm. + if (use_layer_norm) + { + std::fill_n(gate, n_cell * n_batch, 0.0f); + } + else + { + VectorBatchVectorAssign(gate_bias, n_cell, n_batch, gate); + } + // For each batch and cell: compute input_weight * input. + // Skip if input is all zeros. + if (!is_input_all_zeros) + { + MatrixBatchVectorMultiplyAccumulate(input_to_gate_weights, n_cell, n_input, input, n_batch, + gate, /*result_stride=*/1); + } + // For each batch and cell: compute aux_input_weight * aux_input. + // Skip if auxiliary input is not available or all zeros. + if (!is_aux_input_all_zeros) + { + MatrixBatchVectorMultiplyAccumulate(aux_input_to_gate_weights, n_cell, n_aux_input, aux_input, + n_batch, gate, /*result_stride=*/1); + } + // For each batch and cell: compute recurrent_weight * output_state. 
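+  // (There is no all-zeros shortcut for this term: the recurrent contribution
+  // is computed on every step.)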
+  MatrixBatchVectorMultiplyAccumulate(recurrent_to_gate_weights, n_cell, n_output, output_state,
+                                      n_batch, gate, /*result_stride=*/1);
+  // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM)
+  if (use_peephole)
+  {
+    VectorBatchVectorCwiseProductAccumulate(cell_to_gate_weights, n_cell, cell_state, n_batch,
+                                            gate);
+  }
+  // Do layer normalization (if layer norm LSTM)
+  if (use_layer_norm)
+  {
+    MeanStddevNormalization(gate, gate, n_cell, n_batch);
+    VectorBatchVectorCwiseProduct(layer_norm_coefficients, n_cell, gate, n_batch, gate);
+    VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate);
+  }
+  // Apply activation
+  ApplyActivationToVector(gate, n_batch * n_cell, activation, gate);
+}
+
+// Updates the LSTM cell state, used by both float and hybrid LSTM versions.
+//
+// Implements the following formula:
+//   cell_state_new = clip(forget_gate * cell_state + input_gate * cell_gate)
+//
+// With CIFG LSTM, input gate is replaced by (1-forget_gate).
+//
+// Parameters:
+//  - n_batch, n_cell: sizes of vectors
+//  - cell_state: input/output vector, size n_batch*n_cell
+//  - input_gate: input vector, size n_batch*n_cell.
+//  - forget_gate: input/scratch vector, size n_batch*n_cell, modified with CIFG
+//  - cell_gate: input vector, size n_batch*n_cell.
+//  - use_cifg: use 1-forget_gate instead of input_gate.
+//  - clip: if > 0, clip the resulting cell state to [-clip, +clip].
+inline void UpdateLstmCellFloat(int n_batch, int n_cell, float *cell_state,
+                                const float *input_gate, float *forget_gate,
+                                const float *cell_gate, bool use_cifg, float clip)
+{
+  // Define variable for 4th argument to avoid warning
+  // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
+  const float *cwise_product_rhs = cell_state;
+  VectorVectorCwiseProduct(forget_gate, cwise_product_rhs, n_batch * n_cell, cell_state);
+
+  if (use_cifg)
+  {
+    // With CIFG, input_gate = 1-forget_gate. Use the forget_gate array as
+    // scratch, as input_gate array is not allocated in this case. (Be careful
+    // not to write to the scratch before reading the forget gate data.)
+    float *scratch = forget_gate;
+    Sub1Vector(forget_gate, n_batch * n_cell, scratch);
+    VectorVectorCwiseProductAccumulate(cell_gate, scratch, n_batch * n_cell, cell_state);
+  }
+  else
+  {
+    VectorVectorCwiseProductAccumulate(cell_gate, input_gate, n_batch * n_cell, cell_state);
+  }
+  if (clip > 0.0f)
+  {
+    CwiseClipping(cell_state, n_batch * n_cell, clip);
+  }
+}
+
+// Calculates the output state tensor of an LSTM step.
+//
+// Implements the following formula:
+//   output_no_projection = output_gate .* activate(cell_state)
+//     (elementwise vector product)
+// If no projection is used:
+//   output = output_state = output_no_projection
+// With projection:
+//   output = output_state = clip(W*output_no_projection + bias)
+//
+// Output might have a different 'stride' than n_batch, so we need to copy.
+//
+// Parameters:
+//  - n_batch: batches: the number of distinct vectors in each array.
+//  - n_cell, n_output: sizes of vectors.
+//  - cell_state, output_gate: input vectors, size n_batch*n_cell.
+//  - projection_weights, projection_weights_scale, projection_bias:
+//      constant inputs, describing projection matrix and bias.
+//  - proj_clip: if > 0, clip the output of the projection.
+//  - output_state: output vector, size n_batch*n_output. Must be contiguous.
+//  - scratch: scratch area, size n_batch*n_cell.
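+// Note: when no projection weights are given, the activated result is copied
+// straight from 'scratch' to 'output_state', so the caller must guarantee
+// n_output == n_cell in that case.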
+inline void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output,
+                                     const float *cell_state, const float *output_gate,
+                                     FusedActivationFunctionType activation,
+                                     const float *projection_weights, const float *projection_bias,
+                                     const float proj_clip, float *output_state, float *scratch)
+{
+  ApplyActivationToVector(cell_state, n_batch * n_cell, activation, scratch);
+
+  // Define variable for 4th argument to avoid warning
+  // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
+  const float *cwise_product_rhs = scratch;
+  VectorVectorCwiseProduct(output_gate, cwise_product_rhs, n_batch * n_cell, scratch);
+
+  const bool use_projection = (projection_weights != nullptr);
+  const bool use_projection_bias = (projection_bias != nullptr);
+
+  if (use_projection)
+  {
+    if (use_projection_bias)
+    {
+      VectorBatchVectorAssign(projection_bias, n_output, n_batch, output_state);
+    }
+    else
+    {
+      std::fill_n(output_state, n_batch * n_output, 0.0f);
+    }
+    MatrixBatchVectorMultiplyAccumulate(projection_weights, n_output, n_cell, scratch, n_batch,
+                                        output_state, /*result_stride=*/1);
+    if (proj_clip > 0.0f)
+    {
+      CwiseClipping(output_state, n_batch * n_output, proj_clip);
+    }
+  }
+  else
+  {
+    std::copy_n(scratch, n_batch * n_output, output_state);
+  }
+}
+
+// Performs an LSTM batch inference step for input specified by input_ptr.
+// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
+// biases (*_bias_ptr), and buffers (*_scratch), along with additional
+// parameters:
+//  - params: various LSTM params including activation, clipping, etc.,
+//  - n_batch: size of batch,
+//  - n_cell: number of cells (or units),
+//  - n_input: the input size,
+//  - n_aux_input: the auxiliary input size.
+//  - n_output: the output size.
+//  - output_batch_leading_dim: the leading dimension of the output buffer.
+//
+// Input of size 'n_batch * n_input':
+//   input_ptr
+// Input of size 'n_batch * n_aux_input':
+//   aux_input_ptr - optional (can be nullptr)
+//
+// LSTM weights:
+// Input weights of size 'n_cell * n_input':
+//   input_to_input_weights - optional
+//   input_to_forget_weights
+//   input_to_cell_weights
+//   input_to_output_weights
+// Auxiliary input weights of size 'n_cell * n_aux_input':
+//   aux_input_to_input_weights - optional
+//   aux_input_to_forget_weights - optional
+//   aux_input_to_cell_weights - optional
+//   aux_input_to_output_weights - optional
+// Recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weights - optional
+//   recurrent_to_forget_weights
+//   recurrent_to_cell_weights
+//   recurrent_to_output_weights
+// Peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights - optional
+//   cell_to_forget_weights - optional
+//   cell_to_output_weights - optional
+// Projection weights of size 'n_output * n_cell'
+//   projection_weights_ptr - optional
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr - optional
+//   forget_gate_bias_ptr
+//   cell_gate_bias_ptr
+//   output_gate_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   input_layer_norm_coefficients_ptr - optional
+//   forget_layer_norm_coefficients_ptr - optional
+//   cell_layer_norm_coefficients_ptr - optional
+//   output_layer_norm_coefficients_ptr - optional
+//
+// The pointers to the cell and output state and the output are updated.
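+//
+// Scratch buffers scratch0 .. scratch3 each hold n_batch * n_cell floats; they
+// are used for the input, forget, cell, and output gates respectively (see the
+// named aliases at the top of the function body).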
+// +// The pointers input_ptr, aux_input_ptr, and output_ptr point to data aligned +// in batch_major order, and each step processes batch_size many inputs from +// input_ptr, and updates batch_size many cell and output states. +// +// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the +// output tensor, and in most cases will be equal to n_output. It is usually not +// when we want to store the LSTM output into a slice of the output tensor, e.g. +// for bidirectional LSTMs with merge_outputs. In this case, the batched +// operations cannot be used since they assume that the batched outputs are +// contiguous, and we manually loop over the batched outputs. +// LINT.IfChange +inline void LstmStepFloat( + const float *input_ptr, const float *input_to_input_weights_ptr, + const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, + const float *input_to_output_weights_ptr, const float *aux_input_ptr, + const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, + const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, + const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, + const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, + const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, + const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, + const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, + const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, + const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, + const float *output_gate_bias_ptr, const float *projection_weights_ptr, + const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, + int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, + float *output_state_ptr, float *cell_state_ptr, float *scratch0, float *scratch1, + float *scratch2, float *scratch3, float *output_ptr) +{ + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to the get the condition. + const bool use_cifg = (input_to_input_weights_ptr == nullptr); + + // Make named scratch buffers. + float *input_gate_scratch = scratch0; + float *forget_gate_scratch = scratch1; + float *cell_gate_scratch = scratch2; + float *output_gate_scratch = scratch3; + + // Check if inputs are all zeros so we can skip some computations. + const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input); + const bool is_aux_input_all_zeros = + (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + if (!use_cifg) + { + // Calculate the input gate. (If not CIFG.) + CalculateLstmGateFloat(input_ptr, input_to_input_weights_ptr, aux_input_ptr, + aux_input_to_input_weights_ptr, output_state_ptr, + recurrent_to_input_weights_ptr, cell_state_ptr, + cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr, + input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + } + // Calculate the forget gate. 
+ CalculateLstmGateFloat(input_ptr, input_to_forget_weights_ptr, aux_input_ptr, + aux_input_to_forget_weights_ptr, output_state_ptr, + recurrent_to_forget_weights_ptr, cell_state_ptr, + cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr, + forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + // Calculate the cell update gate. + CalculateLstmGateFloat( + input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, + output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, + /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, + n_batch, n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, + is_input_all_zeros, is_aux_input_all_zeros); + // Update the cell state. + UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch, + cell_gate_scratch, use_cifg, params->cell_clip); + // Calculate output gate. + CalculateLstmGateFloat(input_ptr, input_to_output_weights_ptr, aux_input_ptr, + aux_input_to_output_weights_ptr, output_state_ptr, + recurrent_to_output_weights_ptr, cell_state_ptr, + cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr, + output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + // Update the output state. + CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch, + params->activation, projection_weights_ptr, projection_bias_ptr, + params->proj_clip, output_state_ptr, scratch2); + // Copy output state to the output. Note that the output's rows may not be + // contiguous (output_batch_leading_dim != n_output). + for (int b = 0; b < n_batch; b++) + { + std::copy_n(output_state_ptr + b * n_output, n_output, + output_ptr + b * output_batch_leading_dim); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h new file mode 100644 index 000000000..326a44f0c --- /dev/null +++ b/compute/cker/include/cker/operation/LogSoftMax.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_LOGSOFTMAX_H__
+#define __NNFW_CKER_LOGSOFTMAX_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+#include "cker/eigen/Utils.h"
+
+#include <Eigen/Core>
+#include <fixedpoint/fixedpoint.h>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void LogSoftmax(const SoftmaxParams &params, const Shape &input_shape,
+                       const float *input_data, const Shape &output_shape, float *output_data)
+{
+  const int rank = input_shape.DimensionsCount();
+  const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
+  const double beta = params.beta;
+  const int depth = MatchingDim(input_shape, axis, output_shape, axis);
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int inner_size = 1;
+  for (int i = axis + 1; i < rank; ++i)
+  {
+    inner_size *= input_shape.Dims(i);
+  }
+
+  for (int i = 0; i < outer_size; ++i)
+  {
+    for (int j = 0; j < inner_size; ++j)
+    {
+      float max = std::numeric_limits<float>::lowest();
+      for (int c = 0; c < depth; ++c)
+      {
+        max = std::max(max, input_data[(i * depth + c) * inner_size + j]);
+      }
+
+      float sum = 0.f;
+      for (int c = 0; c < depth; ++c)
+      {
+        sum += std::exp((input_data[(i * depth + c) * inner_size + j] - max) * beta);
+      }
+
+      const float log_sum = std::log(sum);
+      for (int c = 0; c < depth; ++c)
+      {
+        output_data[(i * depth + c) * inner_size + j] =
+          (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum;
+      }
+    }
+  }
+}
+
+inline void LogSoftmax(const SoftmaxParams &params, float input_scale, const Shape &input_shape,
+                       const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+  const int rank = input_shape.DimensionsCount();
+  const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
+  const double beta = params.beta;
+  const int depth = MatchingDim(input_shape, axis, output_shape, axis);
+
+  const int32_t clamp_max = std::numeric_limits<uint8_t>::max();
+  const int32_t clamp_min = std::numeric_limits<uint8_t>::min();
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i)
+  {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int inner_size = 1;
+  for (int i = axis + 1; i < rank; ++i)
+  {
+    inner_size *= input_shape.Dims(i);
+  }
+
+  for (int i = 0; i < outer_size; ++i)
+  {
+    for (int j = 0; j < inner_size; ++j)
+    {
+      uint8_t max_val = std::numeric_limits<uint8_t>::min();
+      for (int c = 0; c < depth; ++c)
+      {
+        max_val = std::max(max_val, input_data[(i * depth + c) * inner_size + j]);
+      }
+
+      float sum_exp = 0.0f;
+      const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+      const float *table_offset = &params.table[max_uint8 - max_val];
+      for (int c = 0; c < depth; ++c)
+      {
+        sum_exp += table_offset[input_data[(i * depth + c) * inner_size + j]];
+      }
+      const float log_sum_exp = std::log(sum_exp);
+
+      const float scale = input_scale / params.scale;
+      const float precomputed = (input_scale * max_val * beta + log_sum_exp) / params.scale;
+      for (int c = 0; c < depth; ++c)
+      {
+        const float log_prob =
+          scale * input_data[(i * depth + c) * inner_size + j] * beta - precomputed;
+        const int32_t prob_quantized = std::rint(log_prob) + params.zero_point;
+        output_data[(i * depth + c) * inner_size + j] =
+          static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
+      }
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGSOFTMAX_H__
diff --git a/compute/cker/include/cker/operation/LogicalNot.h b/compute/cker/include/cker/operation/LogicalNot.h
new file mode 100644
index
000000000..5e8d38b45 --- /dev/null +++ b/compute/cker/include/cker/operation/LogicalNot.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_LOGICALNOT_H__ +#define __NNFW_CKER_LOGICALNOT_H__ + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +inline void LogicalNot(const Shape &input_shape, const bool *input_data, const Shape &output_shape, + bool *output_data) +{ + const int size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < size; i++) + { + output_data[i] = !input_data[i]; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_LOGICALNOT_H__ diff --git a/compute/cker/include/cker/operation/LogicalOr.h b/compute/cker/include/cker/operation/LogicalOr.h new file mode 100644 index 000000000..ec07c23d9 --- /dev/null +++ b/compute/cker/include/cker/operation/LogicalOr.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_LOGICAL_OR_H__
+#define __NNFW_CKER_LOGICAL_OR_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void LogicalOrBroadcast(const Shape &unextended_input1_shape, const T *input1_data,
+                               const Shape &unextended_input2_shape, const T *input2_data,
+                               const Shape &unextended_output_shape, T *output_data)
+{
+  assert(unextended_input1_shape.DimensionsCount() <= 4);
+  assert(unextended_input2_shape.DimensionsCount() <= 4);
+  assert(unextended_output_shape.DimensionsCount() <= 4);
+  const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+                                      &desc2);
+
+  for (int b = 0; b < output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < output_shape.Dims(3); ++c)
+        {
+          auto out_idx = Offset(output_shape, b, y, x, c);
+          auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+          auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+          auto in1_val = input1_data[in1_idx];
+          auto in2_val = input2_data[in2_idx];
+          output_data[out_idx] = in1_val || in2_val;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void LogicalOrElementwise(const Shape &shape, const T *input1_data, const T *input2_data,
+                                 T *output_data)
+{
+  int num_elements = shape.FlatSize();
+
+  for (int t = 0; t < num_elements; t++)
+  {
+    output_data[t] = input1_data[t] || input2_data[t];
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGICAL_OR_H__
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
new file mode 100644
index 000000000..3d3e59e55
--- /dev/null
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGISTIC_H__
+#define __NNFW_CKER_LOGISTIC_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/Utils.h"
+
+#include <cmath>
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+                     float *output_data)
+{
+  auto input_map = MapAsVector(input_data, input_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGISTIC_H__
diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h
new file mode 100644
index 000000000..5674ff3ef
--- /dev/null
+++ b/compute/cker/include/cker/operation/MatrixBandPart.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MATRIX_BAND_PART_H__
+#define __NNFW_CKER_MATRIX_BAND_PART_H__
+
+#include "cker/Shape.h"
+
+#include <algorithm>
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T>
+void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shape &input_shape,
+                    const float *input_data, const Shape &output_shape, float *output_data)
+{
+  auto last_dim = input_shape.DimensionsCount() - 1;
+
+  T batch_num = 1;
+  for (int dim = 0; dim < input_shape.DimensionsCount() - 2; dim++)
+  {
+    batch_num *= input_shape.Dims(dim);
+  }
+
+  const T row_num = input_shape.Dims(last_dim - 1);
+  const T col_num = input_shape.Dims(last_dim);
+
+  if (!(num_lower_diags <= row_num))
+    throw std::runtime_error(
+      "MatrixBandPart : num_lower must be negative, or less than or equal to the number of rows");
+
+  if (!(num_upper_diags <= col_num))
+    throw std::runtime_error(
+      "MatrixBandPart : num_upper must be negative, or less than or equal to the number of columns");
+
+  std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init
+
+  // reference code, without multithreading
+  for (T batch = 0; batch < batch_num; ++batch)
+  {
+    for (T row = 0; row < row_num; ++row)
+    {
+      auto output = output_data + (batch * row_num * col_num + row * col_num);
+      auto input = input_data + (batch * row_num * col_num + row * col_num);
+
+      const T band_start =
+        num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags));
+      const T band_end = num_upper_diags < 0 ?
col_num : std::min(static_cast<T>(col_num), + row + num_upper_diags + 1); + + for (T band_idx = band_start; band_idx < band_end; band_idx++) + { + output[band_idx] = input[band_idx]; + } + } + } +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_MATRIX_BAND_PART_H__ diff --git a/compute/cker/include/cker/operation/MaxMin.h b/compute/cker/include/cker/operation/MaxMin.h new file mode 100644 index 000000000..691b3b0b3 --- /dev/null +++ b/compute/cker/include/cker/operation/MaxMin.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_MAXMIN_H__ +#define __NNFW_CKER_MAXMIN_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +struct MaximumOp +{ + template <typename data_type> static data_type op(data_type el1, data_type el2) + { + return el1 > el2 ? el1 : el2; + } +}; + +struct MinimumOp +{ + template <typename data_type> static data_type op(data_type el1, data_type el2) + { + return el1 < el2 ? el1 : el2; + } +}; + +template <typename T, typename Op> +inline void +MaximumMinimumBroadcast4DSlow(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data, Op op) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = op(in1_val, in2_val); + } + } + } + } +} + +template <typename T> +inline void Max(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape, + input2_data, unextended_output_shape, output_data, + MaximumOp::template op<T>); +} + +template <typename T> +inline void Min(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + 
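+  // Illustrative sketch (hypothetical shapes, not from the original source):
+  // Min on inputs of shape [1, 1, 1, 1] and [1, 2, 2, 1] broadcasts the single
+  // value of the first input across all four output positions, writing
+  // output[i] = MinimumOp::op(input1[0], input2[i]) for each element.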
MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape,
+                                   input2_data, unextended_output_shape, output_data,
+                                   MinimumOp::template op<T>);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_MAXMIN_H__
diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h
new file mode 100644
index 000000000..ea3fcaca6
--- /dev/null
+++ b/compute/cker/include/cker/operation/MaxPool.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_MAX_POOL_H__
+#define __NNFW_CKER_MAX_POOL_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+#include "cker/eigen/Utils.h"
+
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T> void MaxPool(const PoolParams &, const Shape &, const T *, const Shape &, T *)
+{
+  static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
+                "cker::MaxPool : This function supports only integer or floating point");
+  throw std::runtime_error("cker::MaxPool : Unsupported data type");
+}
+
+template <>
+void MaxPool<float>(const PoolParams &params, const Shape &input_shape, const float *input_data,
+                    const Shape &output_shape, float *output_data)
+{
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
+  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
+  // Prefill the output to minimum representable float value
+  out_mat.setConstant(std::numeric_limits<float>::lowest());
+  for (int b = 0; b < batches; ++b)
+  {
+    for (int h = 0; h < input_height; ++h)
+    {
+      for (int w = 0; w < input_width; ++w)
+      {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        int hpad = h + params.padding_values.height;
+        int wpad = w + params.padding_values.width;
+        int h_start =
+          (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start =
+          (wpad < params.filter_width) ?
0 : (wpad - params.filter_width) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+        for (int ph = h_start; ph < h_end; ++ph)
+        {
+          for (int pw = w_start; pw < w_end; ++pw)
+          {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) =
+              out_mat.col(out_offset)
+                .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
+          }
+        }
+      }
+    }
+  }
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
+                                                  params.float_activation_max);
+  }
+}
+
+template <>
+void MaxPool<uint8_t>(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data,
+                      const Shape &output_shape, uint8_t *output_data)
+{
+
+  // Here, and in other pooling ops, in order to maintain locality of reference,
+  // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large and hence we
+  // would need arbitrarily large temporary storage, we divide the work up into
+  // depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
+  assert(params.quantized_activation_min <= params.quantized_activation_max);
+  assert(input_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+
+  uint8_t acc[kPoolingAccTrancheSize];
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
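+    // Illustrative sketch (hypothetical numbers, not from the original
+    // source): with depth = 300 and kPoolingAccTrancheSize = 256, the first
+    // tranche covers channels [0, 256) with tranche_depth = 256 and the
+    // second covers [256, 300) with tranche_depth = 44.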
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize) + { + const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + memset(acc, 0, tranche_depth * sizeof(acc[0])); + const uint8_t *input_ptr = + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + for (int fy = filter_y_start; fy < filter_y_end; fy++) + { + const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); + for (int fx = filter_x_start; fx < filter_x_end; fx++) + { + const uint8_t *input_channel_ptr = input_row_ptr; + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + uint8x16_t acc_reg = vld1q_u8(acc + channel); + uint8x16_t input_reg = vld1q_u8(input_channel_ptr); + input_channel_ptr += 16; + acc_reg = vmaxq_u8(acc_reg, input_reg); + vst1q_u8(acc + channel, acc_reg); + } + + for (; channel <= tranche_depth - 8; channel += 8) + { + uint8x8_t acc_reg = vld1_u8(acc + channel); + uint8x8_t input_reg = vld1_u8(input_channel_ptr); + input_channel_ptr += 8; + acc_reg = vmax_u8(acc_reg, input_reg); + vst1_u8(acc + channel, acc_reg); + } +#endif + for (; channel < tranche_depth; ++channel) + { + acc[channel] = std::max(acc[channel], *input_channel_ptr++); + } + input_row_ptr += depth; + } + } + uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base); + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + uint8x16_t a = vld1q_u8(acc + channel); + a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max)); + a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min)); + vst1q_u8(output_ptr + channel, a); + } + for (; channel <= tranche_depth - 8; channel += 8) + { + uint8x8_t a = vld1_u8(acc + channel); + a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max)); + a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min)); + vst1_u8(output_ptr + channel, a); + } +#endif + for (; channel < tranche_depth; ++channel) + { + uint8_t a = acc[channel]; + a = std::max<uint8_t>(a, params.quantized_activation_min); + a = std::min<uint8_t>(a, params.quantized_activation_max); + output_ptr[channel] = static_cast<uint8_t>(a); + } + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_MAX_POOL_H__ diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h new file mode 100644 index 000000000..c0dbc6df5 --- /dev/null +++ b/compute/cker/include/cker/operation/OneHot.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ONEHOT_H__ +#define __NNFW_CKER_ONEHOT_H__ + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T, typename TI> +void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t axis, + const Shape &indices_shape, const TI *indices_data, const Shape &, T *output_data) +{ + if (axis == -1) + axis = indices_shape.DimensionsCount(); + + // prefix_dim_size == # of elements before the axis + // depth == # of elements per axis + // suffix_dim_size == # of elements after the axis + int prefix_dim_size = 1; + for (int i = 0; i < axis; ++i) + { + prefix_dim_size *= indices_shape.Dims(i); + } + const int suffix_dim_size = indices_shape.FlatSize() / prefix_dim_size; + + // View the indices as a matrix of size: + // prefix_dim_size x suffix_dim_size + // View the output as a matrix of size: + // prefix_dim_size x depth x suffix_dim_size + // Then the output is: + // output(i, j, k) == (indices(i, k) == j) ? on : off + for (int i = 0; i < prefix_dim_size; ++i) + { + for (int j = 0; j < depth; ++j) + { + for (int k = 0; k < suffix_dim_size; ++k, ++output_data) + { + *output_data = + static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ONEHOT_H__ diff --git a/compute/cker/include/cker/operation/Pack.h b/compute/cker/include/cker/operation/Pack.h new file mode 100644 index 000000000..fd865047d --- /dev/null +++ b/compute/cker/include/cker/operation/Pack.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_PACK_H__
+#define __NNFW_CKER_PACK_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <cstring>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+inline void Pack(const PackParams &params, const Scalar *const *input_data,
+                 const Shape &output_shape, Scalar *output_data)
+{
+  const int dimensions = output_shape.DimensionsCount();
+  int axis = params.axis;
+  int inputs_count = params.inputs_count;
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; i++)
+  {
+    outer_size *= output_shape.Dims(i);
+  }
+  int copy_size = 1;
+  for (int i = params.axis + 1; i < dimensions; i++)
+  {
+    copy_size *= output_shape.Dims(i);
+  }
+
+  for (int i = 0; i < inputs_count; ++i)
+  {
+    for (int k = 0; k < outer_size; k++)
+    {
+      const Scalar *input_ptr = input_data[i] + copy_size * k;
+      int loc = k * inputs_count * copy_size + i * copy_size;
+      memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_PACK_H__
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
new file mode 100644
index 000000000..4a2732d82
--- /dev/null
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_PAD_H__
+#define __NNFW_CKER_PAD_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename T>
+inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
+                const T *input_data, const Shape &output_shape, T *output_data,
+                const T *constant_value_data)
+{
+  // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
+  // TODO: Come up with a more flexible solution that uses subtensors, as ARM Compute does
+  // TODO: Check if it works for all layouts
+
+  using PaddingInfo = std::pair<int32_t, int32_t>;
+  /** List of padding information */
+  using PaddingList = std::vector<PaddingInfo>;
+
+  const T constant_value = constant_value_data ? *constant_value_data : 0;
+  assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
+
+  PaddingList padding_list(pad_rank);
+  for (int32_t n = 0; n < pad_rank; ++n)
+  {
+    const int32_t *from = padding_data + (n * 2);
+    padding_list[n] = {from[0], from[1]};
+  }
+  for (int32_t i = 0; i < pad_rank; ++i)
+  {
+    assert(output_shape.Dims(i) ==
+           input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
+  }
+  /* Switch on pad_rank, not the 4D-extended rank: input/output shapes are
+     expanded to 4D before calling any cker function, but
+     1. padding_list has only pad_rank entries, so indexing it with the
+        extended rank could read out of bounds;
+     2. handling low-rank data as 4D is slower than handling it as 2D/3D.
+  */
+  switch (pad_rank)
+  {
+    case 0:
+    case 1:
+    {
+      const int32_t in_row_len = input_shape.Dims(0);
+      std::fill_n(output_data, padding_list[0].first, constant_value);
+      std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T));
+      std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
+                  constant_value);
+      break;
+    }
+    case 2: // HW
+    {
+      const int32_t in_row_len = input_shape.Dims(1);
+      const int32_t out_row_size = output_shape.Dims(1);
+
+      // prepend padding rows
+      std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value);
+
+      const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+      for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j)
+      {
+        auto out_offset = i * out_row_size;
+        const auto in_offset = j * in_row_len;
+
+        // prepend padding values
+        std::fill_n(output_data + out_offset, padding_list[1].first, constant_value);
+
+        out_offset += padding_list[1].first;
+
+        // copy a row of input data
+        memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+        out_offset += in_row_len;
+
+        // append padding values
+        std::fill_n(output_data + out_offset, padding_list[1].second, constant_value);
+      }
+
+      // append padding rows
+      std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size,
+                  constant_value);
+      break;
+    }
+    case 3: // HWC
+    {
+      const int32_t in_row_len = input_shape.Dims(2);
+      const int32_t out_row_size = output_shape.Dims(2);
+      const auto plain_size = out_row_size * output_shape.Dims(1);
+
+      // prepend padding plains
+      std::fill_n(output_data, padding_list[0].first * plain_size, constant_value);
+
+      const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
+      for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp)
+      {
+        const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2);
+
+        // prepend padding rows
+        std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size,
+                    constant_value);
+
+        const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first;
+        for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp)
+        {
+          auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2);
+          const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2);
+
+          // prepend padding values
+          std::fill_n(output_data + out_offset, padding_list[2].first, constant_value);
+
+          out_offset += padding_list[2].first;
+
+          // copy a row of input data
+          memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
+
+          out_offset += in_row_len;
+
+          // append padding values
+          std::fill_n(output_data + out_offset, padding_list[2].second, constant_value);
+        }
+
+        // append padding rows
+        std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
+                    padding_list[1].second * out_row_size, constant_value);
+      }
+
+      // append padding plains
+      std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size,
+                  constant_value);
+      break;
+    }
+    case 4:
+    {
+      auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t {
+        return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3);
+      };
+      const int32_t in_row_len = input_shape.Dims(3);
+      const int32_t out_row_size = output_shape.Dims(3);
+      const auto plain_size = out_row_size * output_shape.Dims(2);
+      const auto parallelepiped_size = plain_size * output_shape.Dims(1);
+
+      // prepend padding
parallelepipeds + std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value); + + const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first; + for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp) + { + const auto out_h_offset = get_offset(output_shape, i, 0, 0); + // prepend padding plains + std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value); + + const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first; + for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp) + { + const auto out_w_offset = get_offset(output_shape, i, j, 0); + + // prepend padding rows + std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size, + constant_value); + + const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first; + for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp) + { + auto out_c_offset = get_offset(output_shape, i, j, k); + const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp); + + // prepend padding values + std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value); + + out_c_offset += padding_list[3].first; + + // copy a row of input data + memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T)); + + out_c_offset += in_row_len; + + // append padding values + std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value); + } + + // append padding rows + std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size, + padding_list[2].second * out_row_size, constant_value); + } + + // append padding plains + std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size, + padding_list[1].second * plain_size, constant_value); + } + // append padding parallelepipeds + std::fill_n(output_data + r_b_inp_lim * parallelepiped_size, + padding_list[0].second * parallelepiped_size, constant_value); + break; + } + default: + throw std::runtime_error("Padding for rank > 4 NYI"); + break; + } +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_PAD_H__ diff --git a/compute/cker/include/cker/operation/Pow.h b/compute/cker/include/cker/operation/Pow.h new file mode 100644 index 000000000..1214e0964 --- /dev/null +++ b/compute/cker/include/cker/operation/Pow.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_POW_H__
+#define __NNFW_CKER_POW_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void powImpl(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
+                    const T *input2_data, const Shape &output_shape, T *output_data)
+{
+  const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = std::pow(input1_data[i], input2_data[i]);
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_POW_H__
diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h
new file mode 100644
index 000000000..5c82d111f
--- /dev/null
+++ b/compute/cker/include/cker/operation/Quantize.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_QUANTIZE_H__
+#define __NNFW_CKER_QUANTIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+namespace nnfw
+{
+namespace cker
+{
+template <typename InputT, typename OutputT>
+inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape,
+                     OutputT *output_data, const float output_scale, const int32_t output_offset)
+{
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  const int32_t min_val = std::numeric_limits<OutputT>::min();
+  const int32_t max_val = std::numeric_limits<OutputT>::max();
+
+  for (int i = 0; i < flat_size; i++)
+  {
+    int32_t unclamped =
+      static_cast<int32_t>(std::round(input_data[i] / output_scale)) + output_offset;
+    int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+    output_data[i] = clamped;
+  }
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_QUANTIZE_H__
diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h
new file mode 100644
index 000000000..5c3a773a2
--- /dev/null
+++ b/compute/cker/include/cker/operation/Range.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef __NNFW_CKER_RANGE_H__ +#define __NNFW_CKER_RANGE_H__ + +#include "cker/Shape.h" + +#include <cmath> +#include <stdexcept> + +namespace nnfw +{ +namespace cker +{ +template <typename T> inline int GetSize(T start, T limit, T delta) +{ + if (!((start > limit && delta < 0) || (start < limit && delta > 0))) + { + throw std::runtime_error("Range: invalid input values"); + } + + int size = (std::is_integral<T>::value + ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) + : std::ceil(std::abs((limit - start) / delta))); + return size; +} + +template <typename T> +inline void Range(const T *start_data, const T *limit_data, const T *delta_data, T *output_data) +{ + const T start_value = *start_data; + const T delta_value = *delta_data; + const T limit_value = *limit_data; + + const int num_elements = GetSize<T>(start_value, limit_value, delta_value); + T value = start_value; + + for (int i = 0; i < num_elements; ++i) + { + output_data[i] = value; + value += delta_value; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RANGE_H__ diff --git a/compute/cker/include/cker/operation/ReLU.h b/compute/cker/include/cker/operation/ReLU.h new file mode 100644 index 000000000..2a6cc4a98 --- /dev/null +++ b/compute/cker/include/cker/operation/ReLU.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_RELU_H__ +#define __NNFW_CKER_RELU_H__ + +#include "cker/Shape.h" +#include "cker/eigen/Utils.h" + +#include <cmath> +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ + +inline void ReLU(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); + output_map = input_map.cwiseMax(0.0f); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU_H__ diff --git a/compute/cker/include/cker/operation/ReLU6.h b/compute/cker/include/cker/operation/ReLU6.h new file mode 100644 index 000000000..20df561dc --- /dev/null +++ b/compute/cker/include/cker/operation/ReLU6.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_RELU6_H__ +#define __NNFW_CKER_RELU6_H__ + +#include "cker/Shape.h" +#include "cker/eigen/Utils.h" + +#include <cmath> +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ + +inline void ReLU6(const Shape &input_shape, const float *input_data, float *output_data) +{ + int size = input_shape.FlatSize(); + + for (int i = 0; i < size; ++i) + { + if (input_data[i] <= 0) + { + output_data[i] = 0; + } + else if (input_data[i] > 6.0) + { + output_data[i] = 6.0; + } + else + { + output_data[i] = input_data[i]; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU6_H__ diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h new file mode 100644 index 000000000..2b2e8d338 --- /dev/null +++ b/compute/cker/include/cker/operation/Reduce.h @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REDUCE_H__ +#define __NNFW_CKER_REDUCE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ + +// A generic reduce method that can be used for reduce_sum, reduce_mean, etc. +// This method iterates through input data and reduce elements along the +// dimensions given in axis. 
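+//
+// Illustrative sketch (hypothetical values, not from the original source):
+// reducing a [2, 3] input {{1, 2, 3}, {4, 5, 6}} over axis 1 with a sum
+// reducer yields {6, 15}: one output element per retained (non-reduced)
+// index.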
+
+#ifdef USE_NEON
+inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
+                               float *output_data)
+{
+  const auto input_dims = input_shape.DimsData();
+  const auto input_num_dims = input_shape.DimensionsCount();
+
+  int input_size = 1;
+  int reduce_size = 0;
+  for (int idx = 0; idx < input_num_dims - 1; idx++)
+  {
+    input_size *= input_dims[idx];
+  }
+  reduce_size = input_dims[input_num_dims - 1];
+  for (int idx = 0; idx < input_size; idx++)
+  {
+    // Base of the row being reduced; without this offset every output element
+    // would accumulate the same leading elements of input_data.
+    const float *input_row = input_data + idx * reduce_size;
+    int r_idx = 0;
+    float tmp_data[4] = {
+      0,
+    };
+    float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
+    for (; r_idx <= reduce_size - 32; r_idx += 32)
+    {
+      float32x4_t a10 = vld1q_f32(input_row + r_idx);
+      float32x4_t a11 = vld1q_f32(input_row + r_idx + 4);
+      float32x4_t a12 = vld1q_f32(input_row + r_idx + 8);
+      float32x4_t a13 = vld1q_f32(input_row + r_idx + 12);
+      float32x4_t a20 = vld1q_f32(input_row + r_idx + 16);
+      float32x4_t a21 = vld1q_f32(input_row + r_idx + 20);
+      float32x4_t a22 = vld1q_f32(input_row + r_idx + 24);
+      float32x4_t a23 = vld1q_f32(input_row + r_idx + 28);
+
+      float32x4_t x0 = vaddq_f32(a10, a20);
+      float32x4_t x1 = vaddq_f32(a11, a21);
+      float32x4_t x2 = vaddq_f32(a12, a22);
+      float32x4_t x3 = vaddq_f32(a13, a23);
+
+      float32x4_t y0 = vaddq_f32(x0, x1);
+      float32x4_t y1 = vaddq_f32(x2, x3);
+      float32x4_t y2 = vaddq_f32(y0, y1);
+      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
+    }
+    for (; r_idx <= reduce_size - 8; r_idx += 8)
+    {
+      float32x4_t a1 = vld1q_f32(input_row + r_idx);
+      float32x4_t a2 = vld1q_f32(input_row + r_idx + 4);
+      float32x4_t x = vaddq_f32(a1, a2);
+      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
+    }
+    vst1q_f32(tmp_data, tmp_data_32x4);
+    output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];
+
+    for (; r_idx < reduce_size; r_idx++)
+    {
+      if (r_idx == 0)
+      {
+        output_data[idx] = input_row[0];
+      }
+      else
+      {
+        output_data[idx] += input_row[r_idx];
+      }
+    }
+  }
+}
+#endif // NEON
+
+template <typename In, typename Out>
+inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
+                       const int *axis, const int num_axis, int *input_iter,
+                       Out reducer(const Out current, const In in), Out *output_data)
+{
+  const auto input_dims = input_shape.DimsData();
+  const auto input_num_dims = input_shape.DimensionsCount();
+
+  // Fast path: reduce over the innermost dimension only.
+  if (num_axis == 1 && axis[0] == input_num_dims - 1)
+  {
+    int input_size = 1;
+    int reduce_size = 0;
+    for (int idx = 0; idx < input_num_dims - 1; idx++)
+    {
+      input_size *= input_dims[idx];
+    }
+    reduce_size = input_dims[input_num_dims - 1];
+    for (int idx = 0; idx < input_size; idx++)
+    {
+      for (int r_idx = 0; r_idx < reduce_size; r_idx++)
+      {
+        if (r_idx == 0)
+        {
+          output_data[idx] = input_data[idx * reduce_size];
+        }
+        else
+        {
+          output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
+        }
+      }
+    }
+    return true;
+  }
+
+  // Reset input iterator.
+  for (int idx = 0; idx < input_num_dims; ++idx)
+  {
+    input_iter[idx] = 0;
+  }
+  // Iterate through input_data.
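+  // Illustrative sketch (hypothetical, not from the original source): for a
+  // [2, 2] input reduced over axis = {1}, NextIndex visits input_iter =
+  // {0,0}, {0,1}, {1,0}, {1,1}; ReducedOutputOffset maps these to output
+  // offsets 0, 0, 1, 1, so both elements of each row accumulate into one
+  // output element.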
+  do
+  {
+    size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
+    size_t output_offset =
+      ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
+    output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
+  } while (NextIndex(input_num_dims, input_dims, input_iter));
+  return true;
+}
+
+// This method parses the input 'axis' to remove duplicates and handle negative
+// values, and returns a valid 'out_axis'
+inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
+                        int *out_num_axis)
+{
+  auto num_axis = axes.size();
+  auto axis = axes.data();
+
+  *out_num_axis = 0; // Just in case.
+  // Short-circuit axis resolution for scalars; the axis will go unused.
+  if (num_dims == 0)
+  {
+    return true;
+  }
+  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4
+  for (size_t idx = 0; idx < num_axis; ++idx)
+  {
+    // Handle negative index. A positive index 'p_idx' can be represented as a
+    // negative index 'n_idx' as: n_idx = p_idx - num_dims
+    // e.g. for num_dims = 3, [0, 1, 2] is the same as [-3, -2, -1]
+    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
+    assert(current >= 0 && current < num_dims);
+    bool is_dup = false;
+    for (int j = 0; j < *out_num_axis; ++j)
+    {
+      if (out_axis[j] == current)
+      {
+        is_dup = true;
+        break;
+      }
+    }
+    if (!is_dup)
+    {
+      out_axis[*out_num_axis] = current;
+      *out_num_axis += 1;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
+{
+  const auto dims = shape.DimsData();
+  const auto num_dims = shape.DimensionsCount();
+  size_t num_elements = 1;
+  for (int idx = 0; idx < num_dims; ++idx)
+  {
+    size_t current = static_cast<size_t>(dims[idx]);
+    // Overflow prevention.
+    if (num_elements > std::numeric_limits<size_t>::max() / current)
+    {
+      return false;
+    }
+    num_elements *= current;
+  }
+  for (size_t idx = 0; idx < num_elements; ++idx)
+  {
+    data[idx] = init_value;
+  }
+  return true;
+}
+
+class Reduce
+{
+public:
+  Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}
+
+  void prepare(size_t temp_index_size, size_t resolved_axis_size)
+  {
+    if (_prepared)
+      return;
+
+    // prepare space for temp_index and resolved_axis
+    if (temp_index_size > kMaxSmallSize)
+      _temp_index.resize(temp_index_size);
+    if (resolved_axis_size > kMaxSmallSize)
+      _resolved_axis.resize(resolved_axis_size);
+    _prepared = true;
+  }
+
+  // Computes the generic value (i.e., sum/max/min/prod) of elements across
+  // dimensions given in axis. It needs to pass in init_value and reducer.
+  template <typename T>
+  inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
+                            const Shape &output_shape, T *output_data, const std::vector<int> &axes,
+                            bool, T init_value, T reducer(const T current, const T in))
+  {
+    // Reset output data.
+    if (!InitTensorDataForReduce(output_shape, init_value, output_data))
+    {
+      return false;
+    }
+
+    // Resolve axis.
+    int num_resolved_axis = 0;
+    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
+    {
+      return false;
+    }
+
+    return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
+                            num_resolved_axis, temp_index_data(), reducer, output_data);
+  }
+
+  // Computes the mean of elements across dimensions given in axis.
+ // It does so in two stages, first calculates the sum of elements along the axis + // then divides it by the number of element in axis for quantized values. + template <typename T, typename U> + inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale, + const Shape &input_shape, T *output_data, + int32_t output_zero_point, float output_scale, + const Shape &output_shape, const std::vector<int> &axes, + bool /*keep_dims*/, U *temp_sum, bool compute_sum, + U reducer(const U current, const T in)) + { + // Reset output data. + size_t num_outputs = 1; + for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx) + { + size_t current = static_cast<size_t>(output_shape.Dims(idx)); + // Overflow prevention. + if (num_outputs > std::numeric_limits<size_t>::max() / current) + { + return false; + } + num_outputs *= current; + } + for (size_t idx = 0; idx < num_outputs; ++idx) + { + output_data[idx] = T(); + temp_sum[idx] = U(); + } + + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis)) + { + return false; + } + + if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(), + num_resolved_axis, temp_index_data(), reducer, temp_sum)) + { + return false; + } + + // Calculate mean by dividing output_data by num of aggregated element. + U num_elements_in_axis = 1; + for (int idx = 0; idx < num_resolved_axis; ++idx) + { + size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx])); + // Overflow prevention. + if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis)) + { + return false; + } + num_elements_in_axis *= current; + } + + if (num_elements_in_axis > 0) + { + const float scale = input_scale / output_scale; + if (compute_sum) + { + // TODO(b/116341117): Eliminate float and do this completely in 8bit. + const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f; + for (size_t idx = 0; idx < num_outputs; ++idx) + { + const U value = + static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; + output_data[idx] = static_cast<T>(value); + } + } + else + { + const float bias = -input_zero_point * scale + 0.5f; + for (size_t idx = 0; idx < num_outputs; ++idx) + { + float float_mean = + static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); + float result = std::min(std::round(float_mean * scale + bias) + output_zero_point, + static_cast<float>(std::numeric_limits<T>::max())); + result = std::max(result, static_cast<float>(std::numeric_limits<T>::min())); + output_data[idx] = static_cast<T>(result); + } + } + } + return true; + } + + inline int32_t *resolved_axis_data(void) + { + return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small; + } + inline int32_t *temp_index_data(void) + { + return _temp_index.size() ? 
_temp_index.data() : _temp_index_small; + } + +private: + std::vector<int> _temp_index; + std::vector<int> _resolved_axis; + bool _prepared; + static constexpr int kMaxSmallSize = 4; + int _temp_index_small[kMaxSmallSize]; + int _resolved_axis_small[kMaxSmallSize]; +}; + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REDUCE_H__ diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h new file mode 100644 index 000000000..2e4fc6274 --- /dev/null +++ b/compute/cker/include/cker/operation/ReduceMean.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REDUCEMEAN_H__ +#define __NNFW_CKER_REDUCEMEAN_H__ + +#include "cker/Shape.h" +#include "cker/operation/Reduce.h" + +namespace nnfw +{ +namespace cker +{ + +float round_nearest(float value) +{ + if (value < 0) + { + return static_cast<float>(static_cast<int>(value - 0.5f)); + } + else + { + return static_cast<float>(static_cast<int>(value + 0.5f)); + } +} +template <typename Out, typename In> +Out mean_reducer(const Out data1, const In data2, int normalizer) +{ + return data1 + static_cast<Out>(data2) / normalizer; +} + +template <typename In> int sum_reducer(const int data1, const In data2) +{ + return data1 + static_cast<int>(data2); +} + +template <typename In, typename Out> +inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const int *axis, + const int num_axis, int *input_iter, + Out reducer(const Out current, const In in, int normalizer), + Out *output_data) +{ + const auto input_dims = input_shape.DimsData(); + const auto input_num_dims = input_shape.DimensionsCount(); + int normalizer = 1; + // Reset input iterator. + for (int idx = 0; idx < input_num_dims; ++idx) + { + input_iter[idx] = 0; + } + // Compute number of output elements + for (int idx = 0; idx < num_axis; ++idx) + { + normalizer *= input_dims[axis[idx]]; + } + // Iterate through input_data. + do + { + size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); + size_t output_offset = + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + output_data[output_offset] = + reducer(output_data[output_offset], input_data[input_offset], normalizer); + } while (NextIndex(input_num_dims, input_dims, input_iter)); + return true; +} + +template <typename In> +inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape, const int *axis, + const int num_axis, int *input_iter, + int reducer(const int current, const In in), int *temp_sum) +{ + const auto input_dims = input_shape.DimsData(); + const auto input_num_dims = input_shape.DimensionsCount(); + size_t normalizer = 1; + // Reset input iterator. 
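+  // A hedged arithmetic sketch of the reducers defined above (values are
+  // illustrative): sum_reducer folds raw values, so for the row {2, 4, 6}
+  //
+  //   int s = 0;
+  //   for (int v : {2, 4, 6}) s = sum_reducer<int>(s, v);  // s == 12
+  //
+  // and the caller divides by the returned normalizer afterwards, whereas
+  // mean_reducer accumulates pre-divided terms (0 + 2/3.0 + 4/3.0 + 6/3.0 = 4);
+  // with an integral Out each term would be truncated individually. The loops
+  // below reset the iterator, then accumulate raw sums into temp_sum.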
+ for (int idx = 0; idx < input_num_dims; ++idx) + { + input_iter[idx] = 0; + } + // Compute number of output elements + for (int idx = 0; idx < num_axis; ++idx) + { + normalizer *= input_dims[axis[idx]]; + } + // Iterate through input_data. + do + { + size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); + size_t output_offset = + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]); + } while (NextIndex(input_num_dims, input_dims, input_iter)); + return normalizer; +} + +class ReduceMean : public Reduce +{ +public: + ReduceMean() : Reduce(){}; + + template <typename T> + int PrepareforReduce(const Shape &input_shape, const Shape &output_shape, + const std::vector<int> &axes, T *output_data, T init_value) + { + // Reset output data. + if (!InitTensorDataForReduce(output_shape, init_value, output_data)) + { + return -1; + } + const auto input_dims = input_shape.DimsData(); + const int num_dims = input_shape.DimensionsCount(); + int resolved_axis_size = 1; + const auto num_axes = axes.size(); + + for (size_t idx = 0; idx < num_axes; idx++) + { + int current = axes[idx] < 0 ? (axes[idx] + num_dims) : axes[idx]; + assert(current >= 0 && current < num_dims); + resolved_axis_size *= input_dims[current]; + } + + prepare(num_dims, resolved_axis_size); + + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis)) + { + return -1; + } + + return num_resolved_axis; + } + + // Computes the generic value (i.e., sum/max/min/prod) of elements across + // dimensions given in axis. It needs to pass in init_value and reducer. + template <typename In, typename Out> + inline bool ReduceOp(const Shape &input_shape, const In *input_data, const Shape &output_shape, + Out *output_data, const std::vector<int> &axes, bool, Out init_value, + Out reducer(const Out current, const Out in, int normalizer)) + { + int num_resolved_axis; + num_resolved_axis = PrepareforReduce(input_shape, output_shape, axes, output_data, init_value); + if (num_resolved_axis == -1) + { + return false; + } + return ReduceMeanImpl<In, Out>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, + temp_index_data(), reducer, output_data); + } + + template <typename In, typename Out> + inline bool ReduceOp(const Shape &input_shape, const In *input_data, float input_scale, + int32_t input_offset, const Shape &output_shape, Out *output_data, + float output_scale, int32_t output_offset, const std::vector<int> &axes, + bool, Out init_value, int reducer(const int current, const In in)) + { + size_t num_outputs = 1; + auto output_dims = output_shape.DimsData(); + + for (size_t idx = 0; idx < static_cast<size_t>(output_shape.DimensionsCount()); idx++) + { + num_outputs *= output_dims[idx]; + } + _temp_sum.resize(num_outputs, 0); + int num_resolved_axis; + num_resolved_axis = PrepareforReduce(input_shape, output_shape, axes, output_data, init_value); + if (num_resolved_axis == -1) + { + return false; + } + + size_t normalizer = + ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, + temp_index_data(), reducer, _temp_sum.data()); + if (num_outputs > 0) + { + float scale = input_scale / output_scale; + float bias = -input_offset * scale; + for (size_t idx = 0; idx < num_outputs; idx++) + { + float float_mean = static_cast<float>(_temp_sum[idx]) / normalizer; + float result 
= std::min(round_nearest(float_mean * scale + bias + output_offset),
+                                static_cast<float>(std::numeric_limits<Out>::max()));
+        result = std::max(result, static_cast<float>(std::numeric_limits<Out>::min()));
+        output_data[idx] = static_cast<Out>(result);
+      }
+    }
+    return true;
+  }
+
+private:
+  std::vector<int> _temp_sum;
+};
+
+template <typename In, typename Out>
+void Mean(const Shape &input_shape, const In *input_data, const Shape &output_shape,
+          Out *output_data, const std::vector<int> &axes)
+{
+  UNUSED_RELEASE(output_shape);
+  assert(input_shape.DimensionsCount() > 0);
+  ReduceMean m_obj;
+  m_obj.ReduceOp<In, Out>(input_shape, input_data, output_shape, output_data, axes, true, (Out)0,
+                          mean_reducer);
+}
+
+template <typename In, typename Out>
+void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_scale,
+                 int32_t input_offset, const Shape &output_shape, Out *output_data,
+                 float output_scale, int32_t output_offset, const std::vector<int> &axes)
+{
+  UNUSED_RELEASE(output_shape);
+  assert(input_shape.DimensionsCount() > 0);
+  ReduceMean m_obj;
+  m_obj.ReduceOp<In, Out>(input_shape, input_data, input_scale, input_offset, output_shape,
+                          output_data, output_scale, output_offset, axes, true, (Out)0,
+                          sum_reducer);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REDUCEMEAN_H__
diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h
new file mode 100644
index 000000000..7fc1e9123
--- /dev/null
+++ b/compute/cker/include/cker/operation/ResizeBilinear.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RESIZEBILINEAR_H__
+#define __NNFW_CKER_RESIZEBILINEAR_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t x,
+                                    int32_t y, int32_t depth, int32_t batch,
+                                    const Shape &input_shape, const float *input_data,
+                                    const Shape &output_shape, float *output_data)
+{
+  const int32_t input_width = input_shape.Dims(2);
+  const int32_t output_width = output_shape.Dims(2);
+
+  const int32_t input_x_offset = (x1 - x0) * depth;
+  const int32_t input_y_offset = (y1 - y0) * depth * input_width;
+  const int32_t output_x_offset = depth;
+  const int32_t output_y_offset = depth * output_width;
+
+  for (int ch = 0; ch < depth; ch++)
+  {
+    const int32_t input_offset = Offset(input_shape, batch, y0, x0, ch);
+
+    float x0y0 = input_data[input_offset];
+    float x1y0 = input_data[input_offset + input_x_offset];
+    float x0y1 = input_data[input_offset + input_y_offset];
+    float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
+
+    // Top left corner.
+ const int32_t output_offset = Offset(output_shape, batch, y, x, ch); + output_data[output_offset] = x0y0; + + // Top right corner. + output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2; + + // Bottom left corner. + float output = (x0y0 + x0y1) / 2; + output_data[output_offset + output_y_offset] = output; + + // Bottom right corner. + output_data[output_offset + output_x_offset + output_y_offset] = + (output + ((x1y0 + x1y1) / 2)) / 2; + } +} + +inline void ResizeBilinear2x2(int32_t batches, int32_t input_height, int32_t input_width, + int32_t depth, int32_t output_height, int32_t output_width, + const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + for (int b = 0; b < batches; b++) + { + for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) + { + for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) + { + int32_t x1 = std::min(x0 + 1, input_width - 1); + int32_t y1 = std::min(y0 + 1, input_height - 1); + ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, input_data, + output_shape, output_data); + } + } + } +} + +inline void ResizeBilinearKernel(const float *input_ptr, int32_t depth, float scale, + float *output_ptr) +{ + for (int32_t i = 0; i < depth; i++) + { + *output_ptr += *input_ptr * scale; + output_ptr++; + input_ptr++; + } +} + +inline void ComputeInterpolationValues(const float value, const float scale, + const bool half_pixel_centers, int32_t input_size, + float *scaled_value, int32_t *lower_bound, + int32_t *upper_bound) +{ + if (half_pixel_centers) + { + *scaled_value = (value + 0.5f) * scale - 0.5f; + } + else + { + *scaled_value = value * scale; + } + float scaled_value_floor = std::floor(*scaled_value); + *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0)); + *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1); +} + +inline void ResizeBilinearGeneric(int32_t batches, int32_t input_height, int32_t input_width, + int32_t depth, int32_t output_height, int32_t output_width, + float height_scale, float width_scale, const Shape &input_shape, + const float *input_data, float *output_data, + const bool half_pixel_centers) +{ + memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float)); + + int32_t output_offset = 0; + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + float input_y; + int32_t y0, y1; + ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0, + &y1); + for (int x = 0; x < output_width; ++x) + { + float input_x; + int32_t x0, x1; + ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0, + &x1); + float *output_ptr = &output_data[output_offset]; + + // Run kernel on the 4 corners of the bilinear resize algorithm. 
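+        // The four scale factors passed below are the standard bilinear
+        // weights. With fx = input_x - x0 and fy = input_y - y0 (normally in
+        // [0, 1]), the accumulated value per channel is
+        //
+        //   out = (1-fy)*(1-fx)*in(y0,x0) + (1-fy)*fx*in(y0,x1)
+        //       + fy*(1-fx)*in(y1,x0) + fy*fx*in(y1,x1);
+        //
+        // the weights sum to 1, so a constant input is resized to itself.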
+ int32_t input_offset = Offset(input_shape, b, y0, x0, 0); + float scale = (1 - (input_y - y0)) * (1 - (input_x - x0)); + const float *input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + input_offset = Offset(input_shape, b, y0, x1, 0); + scale = (1 - (input_y - y0)) * (input_x - x0); + input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + input_offset = Offset(input_shape, b, y1, x0, 0); + scale = (input_y - y0) * (1 - (input_x - x0)); + input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + input_offset = Offset(input_shape, b, y1, x1, 0); + scale = (input_y - y0) * (input_x - x0); + input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + output_offset += depth; + } + } + } +} + +template <typename T> +inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_height, + int32_t input_width, int32_t depth, + int32_t output_height, int32_t output_width, + float height_scale, float width_scale, + const Shape &input_shape, const T *input_data, + T *output_data, const bool half_pixel_centers) +{ + T *output_ptr = &output_data[0]; + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + float input_y; + int32_t y0, y1; + ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0, + &y1); + for (int x = 0; x < output_width; ++x) + { + float input_x; + int32_t x0, x1; + ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0, + &x1); + + int32_t input_offset[4] = { + Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; + float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), + (1 - (input_y - y0)) * (input_x - x0), + (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)}; + + for (int d = 0; d < depth; d++) + { + const T *input_ptr = &input_data[d]; + *output_ptr++ = static_cast<T>( + input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); + } + } + } + } +} + +void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + int32_t batches = static_cast<int32_t>(MatchingDim(input_shape, 0, output_shape, 0)); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = static_cast<int32_t>(MatchingDim(input_shape, 3, output_shape, 3)); + + // Specialize for 2x2 upsample. 
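+  // A hedged numeric sketch of the scale selection below (sizes are
+  // illustrative): for input_height = 4 and output_height = 8 the default
+  // scale is 4.0 / 8 = 0.5, so output row y samples input row y * 0.5; with
+  // align_corners it becomes (4 - 1) / (8 - 1.0) = 3/7, which maps the last
+  // output row exactly onto the last input row (7 * 3/7 == 3).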
+ if (!params.align_corners && !params.half_pixel_centers && + params.output_height == 2 * input_height && params.output_width == 2 * input_width) + { + ResizeBilinear2x2(batches, input_height, input_width, depth, params.output_height, + params.output_width, input_shape, input_data, output_shape, output_data); + } + else + { + float height_scale = static_cast<float>(input_height) / params.output_height; + float width_scale = static_cast<float>(input_width) / params.output_width; + if (params.align_corners && params.output_height > 1) + { + height_scale = static_cast<float>(input_height - 1) / (params.output_height - 1); + } + if (params.align_corners && params.output_width > 1) + { + width_scale = static_cast<float>(input_width - 1) / (params.output_width - 1); + } + + ResizeBilinearGeneric(batches, input_height, input_width, depth, params.output_height, + params.output_width, height_scale, width_scale, input_shape, input_data, + output_data, params.half_pixel_centers); + } +} + +void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) +{ + int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); + + float height_scale = (params.align_corners && params.output_height > 1) + ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) + : (static_cast<float>(input_height) / params.output_height); + + float width_scale = (params.align_corners && params.output_width > 1) + ? (static_cast<float>(input_width - 1) / (params.output_width - 1)) + : (static_cast<float>(input_width) / params.output_width); + + ResizeBilinearGenericSmallChannel<uint8_t>( + batches, input_height, input_width, depth, params.output_height, params.output_width, + height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RESIZEBILINEAR_H__ diff --git a/compute/cker/include/cker/operation/Reverse.h b/compute/cker/include/cker/operation/Reverse.h new file mode 100644 index 000000000..ef4673f21 --- /dev/null +++ b/compute/cker/include/cker/operation/Reverse.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_REVERSE_H__ +#define __NNFW_CKER_REVERSE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename Scalar> +void Reverse(int axis, const Shape &input_shape, const Scalar *input_data, const Shape &, + Scalar *output_data) +{ + int outer_size = 1; + for (int i = 0; i < axis; ++i) + { + outer_size *= input_shape.Dims(i); + } + + int copy_size = 1; + for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) + { + copy_size *= input_shape.Dims(i); + } + + const int dims_at_axis = input_shape.Dims(axis); + for (int i = 0; i < outer_size; ++i) + { + for (int j = 0; j < dims_at_axis; ++j) + { + const int start_pos = (i * dims_at_axis + j) * copy_size; + Scalar *output_ptr = output_data + start_pos; + int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size; + memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar)); + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REVERSE_H__ diff --git a/compute/cker/include/cker/operation/Round.h b/compute/cker/include/cker/operation/Round.h new file mode 100644 index 000000000..a04a741cf --- /dev/null +++ b/compute/cker/include/cker/operation/Round.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ROUND_H__ +#define __NNFW_CKER_ROUND_H__ + +#include "cker/Shape.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline float RoundToNearest(float value) +{ + auto floor_val = std::floor(value); + auto diff = value - floor_val; + if ((diff < 0.5f) || ((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0))) + { + return floor_val; + } + else + { + return floor_val = floor_val + 1.0f; + } +} + +inline void Round(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + // Note that this implementation matches that of tensorFlow tf.round + // and corresponds to the bankers rounding method. + // cfenv (for fesetround) is not yet supported universally on Android, so + // using a work around. + output_data[i] = RoundToNearest(input_data[i]); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ROUND_H__ diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h new file mode 100644 index 000000000..ab2de94cc --- /dev/null +++ b/compute/cker/include/cker/operation/Select.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SELECT_H__ +#define __NNFW_CKER_SELECT_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +template <typename D, typename T> +void Select(const Shape &input_condition_shape, const D *input_condition_data, + const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, + const T *input_y_data, const Shape &output_shape, T *output_data) +{ + const int64_t flatsize = + MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); + for (int64_t i = 0; i < flatsize; ++i) + { + output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i]; + } +} + +template <typename D, typename T> +void RankOneSelect(const Shape &input_condition_shape, const D *input_condition_data, + const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, + const T *input_y_data, const Shape &output_shape, T *output_data) +{ + const int64_t outer_size = input_condition_shape.FlatSize(); + assert(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0) == outer_size); + const int64_t inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape); + + int64_t offset = 0; + for (int64_t i = 0; i < outer_size; i++) + { + const T *input_data = (input_condition_data[i] != 0) ? input_x_data : input_y_data; + memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T)); + offset += inner_size; + } +} + +template <typename D, typename T> +void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_condition_data, + const Shape &input_x_shape, const T *input_x_data, + const Shape &input_y_shape, const T *input_y_data, + const Shape &output_shape, T *output_data) +{ + assert(input_condition_shape.DimensionsCount() <= 4); + assert(input_x_shape.DimensionsCount() <= 4); + assert(input_y_shape.DimensionsCount() <= 4); + assert(output_shape.DimensionsCount() <= 4); + + const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); + + NdArrayDesc<4> desc_condition; + NdArrayDesc<4> desc_x; + NdArrayDesc<4> desc_y; + NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape, + &desc_condition, &desc_x, &desc_y); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest + // stride, typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for + // the best cache behavior. 
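+  // A hedged broadcast example (shapes are illustrative): a condition of
+  // shape [1, 1, 1, 4] selecting between x and y of shape [2, 3, 5, 4].
+  // After broadcasting, SubscriptToIndex(desc_condition, b, y, x, c)
+  // collapses to index c for every (b, y, x), so one 4-element mask is
+  // reused across all 2 * 3 * 5 = 30 outer positions.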
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) + { + const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c); + const int x_index = SubscriptToIndex(desc_x, b, y, x, c); + const int y_index = SubscriptToIndex(desc_y, b, y, x, c); + output_data[Offset(extended_output_shape, b, y, x, c)] = + input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SELECT_H__ diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h new file mode 100644 index 000000000..a072cff8e --- /dev/null +++ b/compute/cker/include/cker/operation/Slice.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SLICE_H__ +#define __NNFW_CKER_SLICE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void Slice(const SliceParams &op_params, const Shape &input_shape, + SequentialTensorWriter<T> *writer) +{ + // TODO(dkalenichenko): This op only supports 4D tensors or smaller. + assert(op_params.begin_count <= 4); + assert(op_params.size_count <= 4); + + const int begin_count = op_params.begin_count; + const int size_count = op_params.size_count; + // We front-pad the begin and size vectors. + const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0]; + const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1) ? input_shape.Dims(0) + : start_b + op_params.size[0]; + const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; + const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) + ? input_shape.Dims(1) + : start_h + op_params.size[size_count - 3]; + const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; + const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) + ? input_shape.Dims(2) + : start_w + op_params.size[size_count - 2]; + const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; + const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) + ? 
input_shape.Dims(3) + : start_d + op_params.size[size_count - 1]; + + for (int in_b = start_b; in_b < stop_b; ++in_b) + { + for (int in_h = start_h; in_h < stop_h; ++in_h) + { + for (int in_w = start_w; in_w < stop_w; ++in_w) + { + const int len = stop_d - start_d; + if (len > 0) + writer->WriteN(Offset(input_shape, in_b, in_h, in_w, start_d), len); + } + } + } +} + +template <typename T> +inline void Slice(const SliceParams &op_params, const Shape &input_shape, const T *input_data, + T *output_data) +{ + SequentialTensorWriter<T> writer(input_data, output_data); + return Slice(op_params, input_shape, &writer); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SLICE_H__ diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h new file mode 100644 index 000000000..0e0f364ba --- /dev/null +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SOFTMAX_H__ +#define __NNFW_CKER_SOFTMAX_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" +#include "cker/Types.h" +#include "cker/eigen/Utils.h" + +#include <Eigen/Core> +#include <fixedpoint/fixedpoint.h> +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +namespace reference +{ + +// Note. This Softmax function supports all of dimensions +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) + { + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + float max = std::numeric_limits<float>::lowest(); + for (int c = 0; c < depth; ++c) + { + max = std::max(max, input_data[i * depth + c]); + } + + // Compute sum. + float sum = 0.f; + for (int c = 0; c < depth; ++c) + { + sum += std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)); + } + + // Compute result. + for (int c = 0; c < depth; ++c) + { + output_data[i * depth + c] = + std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum; + } + } +} +} + +// Performs softmax along the input of size (input_size * batch_size). +inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta, + float *out) +{ + assert(input_size > 0); + + // For each batch + for (int b = 0; b < batch_size; b++) + { + // Find the max coeff. 
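+    // Subtracting the max keeps exp() in range without changing the result,
+    // since exp(x - C) / sum(exp(x - C)) == exp(x) / sum(exp(x)) for any C.
+    // Hedged example (values are illustrative): for in = {1000, 1001},
+    // exp(1000) overflows a float, but after shifting by the max the terms
+    // are exp(-1) and exp(0), giving roughly {0.269, 0.731}. The loop below
+    // finds that max.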
+ float max_coeff = in[0]; + for (int i = 1; i < input_size; i++) + { + if (in[i] > max_coeff) + max_coeff = in[i]; + } + + // Compute the normalized sum of exps. + float exp_sum = 0.0; + for (int i = 0; i < input_size; i++) + { + out[i] = std::exp((in[i] - max_coeff) * beta); + exp_sum += out[i]; + } + + // Divide by the sum of exps. + float reciprocal_sum_exp = 1.f / exp_sum; + for (int i = 0; i < input_size; i++) + { + out[i] *= reciprocal_sum_exp; + } + + // Advance in and out pointers for the next batch. + in += input_size; + out += input_size; + } +} + +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + // Validate whether if shapes of input and output are the same + MatchingFlatSize(input_shape, output_shape); + + const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape); + auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape); + // Compute the exponential first, removing the max coefficient for numerical + // stability. + out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta; + // We are separating out the exp function so that exp can be vectorized. + out_mat = out_mat.array().exp(); + // Normalize to get the activations. + Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse(); + out_mat.array().rowwise() *= scale; +} + +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) +{ + const int32_t input_beta_multiplier = params.input_multiplier; + const int32_t input_beta_left_shift = params.input_left_shift; + const int diff_min = params.diff_min; + // The representation chosen for the input to the exp() function is Q5.26. + // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input_beta_multiplier, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. 
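+  // A hedged reading of the fixed-point formats chosen below: with gemmlowp,
+  // FixedPoint<int32_t, 5> is Q5.26 (range [-32, 32), resolution 2^-26), so
+  // the raw int32 value (1 << 26) represents exactly 1.0, and
+  // FixedPoint<int32_t, 12> is Q12.19, giving the sum-of-exps accumulator
+  // integer headroom up to 4096 before it would saturate.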
+ static const int kScaledDiffIntegerBits = 5; + static const int kAccumulationIntegerBits = 12; + using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>; + using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>; + using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; + + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) + { + uint8_t max_in_row = 0; + for (int c = 0; c < depth; ++c) + { + max_in_row = std::max(max_in_row, input_data[i * depth + c]); + } + + FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); + for (int c = 0; c < depth; ++c) + { + int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; + if (input_diff >= diff_min) + { + const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( + exp_on_negative_values(scaled_diff_f8)); + } + } + + int32_t fixed_sum_of_exps = sum_of_exps.raw(); + int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps)); + // This is the number of bits to the left of the binary point above 1.0. + // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and + // no later adjustment will be needed. + int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; + int32_t shifted_sum_minus_one = + static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32_t>(1) << 31)); + + FixedPoint0 shifted_scale = + one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); + + for (int c = 0; c < depth; ++c) + { + int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; + if (input_diff >= diff_min) + { + const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + + FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); + int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), + num_bits_over_unit + 31 - 8); + + output_data[i * depth + c] = static_cast<uint8_t>( + std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); + } + else + { + output_data[i * depth + c] = 0; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SOFTMAX_H__ diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h new file mode 100644 index 000000000..feeb358c9 --- /dev/null +++ b/compute/cker/include/cker/operation/SpaceToBatchND.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SPACE_TO_BATCH_ND_H__ +#define __NNFW_CKER_SPACE_TO_BATCH_ND_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void SpaceToBatchND(const SpaceToBatchParams ¶ms, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_block_shape_shape, + const int32_t *block_shape_data, const Shape &unextended_padding_shape, + const int32_t *paddings_data, const Shape &unextended_output_shape, + T *output_data) +{ + UNUSED_RELEASE(unextended_block_shape_shape); + UNUSED_RELEASE(unextended_padding_shape); + + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int input_batch_size = input_shape.Dims(0); + + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_batch_size = output_shape.Dims(0); + + const int block_shape_height = block_shape_data[0]; + const int block_shape_width = block_shape_data[1]; + const int padding_top = paddings_data[0]; + const int padding_left = paddings_data[2]; + + // For uint8 quantized, the correct padding "zero value" is the output offset. + const int32_t pad_value = params.output_offset; + + for (int out_b = 0; out_b < output_batch_size; ++out_b) + { + int input_batch = out_b % input_batch_size; + int shift_w = (out_b / input_batch_size) % block_shape_width; + int shift_h = (out_b / input_batch_size) / block_shape_width; + for (int out_h = 0; out_h < output_height; ++out_h) + { + for (int out_w = 0; out_w < output_width; ++out_w) + { + T *out = output_data + Offset(output_shape, out_b, out_h, out_w, 0); + if (out_h * block_shape_height + shift_h < padding_top || + out_h * block_shape_height + shift_h >= padding_top + input_height || + out_w * block_shape_width + shift_w < padding_left || + out_w * block_shape_width + shift_w >= padding_left + input_width) + { + // This may not execute correctly when pad_value != 0 and T != uint8. + memset(out, pad_value, depth * sizeof(T)); + } + else + { + const T *in = + input_data + Offset(input_shape, input_batch, + (out_h * block_shape_height + shift_h) - padding_top, + (out_w * block_shape_width + shift_w) - padding_left, 0); + memcpy(out, in, depth * sizeof(T)); + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_BATCH_ND_H__ diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h new file mode 100644 index 000000000..ef679315e --- /dev/null +++ b/compute/cker/include/cker/operation/SpaceToDepth.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__ +#define __NNFW_CKER_SPACE_TO_DEPTH_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void SpaceToDepth(const SpaceToDepthParams ¶ms, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + + const int input_depth = input_shape.Dims(3); + const int batch_size = input_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. + const int stride = params.block_size * input_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0); + for (int offset_h = 0; offset_h < params.block_size; ++offset_h) + { + T *dst = output_ptr; + for (int out_w = 0; out_w < output_width; ++out_w) + { + memcpy(dst, input_data, stride * sizeof(T)); + input_data += stride; + dst += output_depth; + } + output_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/operation/Split.h b/compute/cker/include/cker/operation/Split.h new file mode 100644 index 000000000..08a436ee9 --- /dev/null +++ b/compute/cker/include/cker/operation/Split.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SPLIT_H__ +#define __NNFW_CKER_SPLIT_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename Scalar> +void Split(const SplitParams ¶ms, const Shape &input_shape, const Scalar *input_data, + const Shape &output_shape, Scalar *const *output_data) +{ + const int split_dimensions = input_shape.DimensionsCount(); + int axis = params.axis < 0 ? 
params.axis + split_dimensions : params.axis; + int outputs_count = params.num_split; + + int64_t outer_size = 1; + for (int i = 0; i < axis; ++i) + { + outer_size *= input_shape.Dims(i); + } + // For all output arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < split_dimensions; ++i) + { + base_inner_size *= input_shape.Dims(i); + } + + const Scalar *input_ptr = input_data; + for (int k = 0; k < outer_size; k++) + { + for (int i = 0; i < outputs_count; ++i) + { + const int copy_size = output_shape.Dims(axis) * base_inner_size; + memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar)); + input_ptr += copy_size; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPLIT_H__ diff --git a/compute/cker/include/cker/operation/SplitV.h b/compute/cker/include/cker/operation/SplitV.h new file mode 100644 index 000000000..9e46f4b04 --- /dev/null +++ b/compute/cker/include/cker/operation/SplitV.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SPLIT_V_H__ +#define __NNFW_CKER_SPLIT_V_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename Scalar> +void SplitV(const SplitVParams ¶ms, const Shape &input_shape, const Scalar *input_data, + std::vector<nnfw::cker::Shape> &output_shapes, Scalar *const *output_data) +{ + const int split_dimensions = input_shape.DimensionsCount(); + int axis = params.axis < 0 ? 
params.axis + split_dimensions : params.axis; + int outputs_count = params.num_split; + + int64_t split_size = 0; + + for (int i = 0; i < outputs_count; i++) + { + // TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions); + for (int j = 0; j < split_dimensions; j++) + { + if (j != axis) + { + MatchingDim(output_shapes[i], j, input_shape, j); + } + } + split_size += output_shapes[i].Dims(axis); + } + + int64_t outer_size = 1; + for (int i = 0; i < axis; ++i) + { + outer_size *= input_shape.Dims(i); + } + // For all output arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < split_dimensions; ++i) + { + base_inner_size *= input_shape.Dims(i); + } + + const Scalar *input_ptr = input_data; + int copy_size = 0; + for (int k = 0; k < outer_size; k++) + { + for (int i = 0; i < outputs_count; ++i) + { + copy_size = output_shapes[i].Dims(axis) * base_inner_size; + memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar)); + input_ptr += copy_size; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPLIT_V_H__ diff --git a/compute/cker/include/cker/operation/SqDiff.h b/compute/cker/include/cker/operation/SqDiff.h new file mode 100644 index 000000000..93428d5fd --- /dev/null +++ b/compute/cker/include/cker/operation/SqDiff.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REDUCESQDIFF_H__ +#define __NNFW_CKER_REDUCESQDIFF_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +#define SQDIFF(N) \ + do \ + { \ + NdArrayDesc<N> input1_desc; \ + NdArrayDesc<N> input2_desc; \ + NdArrayDesc<N> output_desc; \ + SqDiffImpl<T, N>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ + output_data, &input1_desc, &input2_desc, &output_desc); \ + } while (0); + +template <typename T, int N> +void SqDiffImpl(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, T *output_data, + NdArrayDesc<N> *desc1_in, NdArrayDesc<N> *desc2_in, NdArrayDesc<N> *desc_out) +{ + std::vector<int> input_iter; + input_iter.resize(N); + const auto output_dims = output_shape.DimsData(); + + // Copy dims to desc, calculating strides. 
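+  // A hedged sketch of the computation (values are illustrative): for
+  // input1 = {3, 5} and input2 = {1, 8} of equal shape, the output is
+  // {(3-1)^2, (5-8)^2} = {4, 9}; with broadcasting, an input2 of shape {1}
+  // would be squared-differenced against every element of input1.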
+ CopyDimsToDesc<N>(output_shape, desc_out); + NdArrayDescsForElementwiseBroadcast<N>(input1_shape, input2_shape, desc1_in, desc2_in); + + do + { + int input1_indx = SubscriptToIndexGeneric(desc1_in, input_iter.data()); + int input2_indx = SubscriptToIndexGeneric(desc2_in, input_iter.data()); + int output_indx = SubscriptToIndexGeneric(desc_out, input_iter.data()); + output_data[output_indx] = (input1_data[input1_indx] - input2_data[input2_indx]) * + (input1_data[input1_indx] - input2_data[input2_indx]); + } while (NextIndex(N, output_dims, input_iter.data())); +} + +template <typename T> +void SqDiff(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, T *output_data) +{ + UNUSED_RELEASE(output_shape); + assert(input1_shape.DimensionsCount() > 0 && input2_shape.DimensionsCount() > 0 && + output_shape.DimensionsCount() > 0); + int outRank = output_shape.DimensionsCount(); + + switch (outRank) + { + case 4: + SQDIFF(4); + break; + + case 3: + SQDIFF(3); + break; + + case 2: + SQDIFF(2); + break; + + case 1: + SQDIFF(1); + break; + + default: + throw std::runtime_error("Support up to 4-D tensors at present"); + break; + } +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REDUCESQDIFF_H__ diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h new file mode 100644 index 000000000..d5952ae23 --- /dev/null +++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__ +#define __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/eigen/EigenSupport.h" + +#include "cker/operation/Helper/Tensor.h" +#include "cker/operation/Helper/PhiloxRandom.h" +#include "cker/operation/Helper/RandomOpCpu.h" +#include "cker/operation/Helper/RandomDistributions.h" + +namespace nnfw +{ +namespace cker +{ + +void GenerateKey(Tensor seed, random::PhiloxRandom::Key *out_key, + random::PhiloxRandom::ResultType *out_counter) +{ + // Grab the two seeds + uint32_t seed0; + uint32_t seed1; + + const auto seed_vals = seed.flat<int32_t>(); + + seed0 = seed_vals(0); + seed1 = seed_vals(1); + // Scramble the seeds so that the user doesn't need to worry about which + // part of the seed needs to be strong. 
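+  // A hedged reading of the scramble below: the user seeds are loaded into
+  // the counter, a Philox generator keyed by two fixed constants is invoked
+  // once, and its 128-bit output is split into a fresh key and counter.
+  // Equal seeds therefore always reproduce the same stream (the "stateless"
+  // contract), while weak seeds such as {0, 1} still get diffused across all
+  // key bits.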
+ (*out_key)[0] = 0x3ec8f720; + (*out_key)[1] = 0x02461e29; + (*out_counter)[0] = static_cast<uint32_t>(seed0); + (*out_counter)[1] = (*out_counter)[3] = 0; + (*out_counter)[2] = static_cast<uint32_t>(seed1); + const auto mix = random::PhiloxRandom(*out_counter, *out_key)(); + (*out_key)[0] = mix[0]; + (*out_key)[1] = mix[1]; + (*out_counter)[0] = (*out_counter)[1] = 0; + (*out_counter)[2] = mix[2]; + (*out_counter)[3] = mix[3]; +} + +template <typename Device, class Distribution> +void Fill(random::PhiloxRandom random, Tensor *output) +{ + // Build distribution + typedef typename Distribution::ResultElementType T; + + auto flat = output->flat<T>(); + // Reuse the compute kernels from the stateful random ops + functor::FillPhiloxRandom<Device, Distribution>()(random, flat.data(), flat.size(), + Distribution()); +} + +inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data, + const Shape &seed_shape, const int *seed_data, + const Shape &output_shape, float *output_data) +{ + Tensor shape_t; + Tensor seed_t; + + shape_t.shape.ReplaceWith(shape_shape.DimensionsCount(), shape_shape.DimsData()); + shape_t.buffer = (void *)shape_data; + + seed_t.shape.ReplaceWith(seed_shape.DimensionsCount(), seed_shape.DimsData()); + seed_t.buffer = (void *)seed_data; + + Tensor output_t; + output_t.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData()); + output_t.buffer = output_data; + + random::PhiloxRandom::Key key; + random::PhiloxRandom::ResultType counter; + + GenerateKey(seed_t, &key, &counter); + + Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>( + random::PhiloxRandom(counter, key), &output_t); +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__ diff --git a/compute/cker/include/cker/operation/StridedSlice.h b/compute/cker/include/cker/operation/StridedSlice.h new file mode 100644 index 000000000..2f1089575 --- /dev/null +++ b/compute/cker/include/cker/operation/StridedSlice.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_STRIDEDSLICE_H__ +#define __NNFW_CKER_STRIDEDSLICE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ +// Use until std::clamp() is available from C++17. 
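+// A hedged usage sketch (values are illustrative): Clamp(7, 0, 3) returns 3
+// and Clamp(-2, 0, 3) returns 0, matching what C++17 std::clamp(v, lo, hi)
+// from <algorithm> would return.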
+inline int Clamp(const int v, const int lo, const int hi) +{ + assert(!(hi < lo)); + if (hi < v) + return hi; + if (v < lo) + return lo; + return v; +} + +inline void StridedSlicePadIndices(StridedSliceParams *p, int dim_count) +{ + // Add indices and mask bits to fully include extra dimensions + assert(dim_count <= 4); + assert(dim_count >= p->start_indices_count); + assert(p->start_indices_count == p->stop_indices_count); + assert(p->stop_indices_count == p->strides_count); + + const int pad_count = dim_count - p->start_indices_count; + + // Pad indices at start, so move arrays by pad_count. + for (int i = p->start_indices_count - 1; i >= 0; --i) + { + p->strides[i + pad_count] = p->strides[i]; + p->start_indices[i + pad_count] = p->start_indices[i]; + p->stop_indices[i + pad_count] = p->stop_indices[i]; + } + for (int i = 0; i < pad_count; ++i) + { + p->start_indices[i] = 0; + p->stop_indices[i] = 1; + p->strides[i] = 1; + } + + // Pad masks with 0s or 1s as required. + p->shrink_axis_mask <<= pad_count; + p->ellipsis_mask <<= pad_count; + p->new_axis_mask <<= pad_count; + p->begin_mask <<= pad_count; + p->end_mask <<= pad_count; + p->begin_mask |= (1 << pad_count) - 1; + p->end_mask |= (1 << pad_count) - 1; + + p->start_indices_count = dim_count; + p->stop_indices_count = dim_count; + p->strides_count = dim_count; +} + +// Return the index for the first element along that axis. This index will be a +// positive integer between [0, axis_size - 1] that can be used to index +// directly into the data. +inline int StartForAxis(const StridedSliceParams ¶ms, const Shape &input_shape, int axis) +{ + const auto begin_mask = params.begin_mask; + const auto *start_indices = params.start_indices; + const auto *strides = params.strides; + // Begin with the specified index. + int start = start_indices[axis]; + + // begin_mask override + if (begin_mask & 1 << axis) + { + if (strides[axis] > 0) + { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axis_size-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits<int>::lowest(); + } + else + { + // Backward iteration - use the last element. + start = std::numeric_limits<int>::max(); + } + } + + // Handle negative indices + int axis_size = input_shape.Dims(axis); + if (start < 0) + { + start += axis_size; + } + + // Clamping + start = Clamp(start, 0, axis_size - 1); + + return start; +} + +// Return the "real" index for the end of iteration along that axis. This is an +// "end" in the traditional C sense, in that it points to one past the last +// element. ie. So if you were iterating through all elements of a 1D array of +// size 4, this function would return 4 as the stop, because it is one past the +// "real" indices of 0, 1, 2 & 3. +inline int StopForAxis(const StridedSliceParams ¶ms, const Shape &input_shape, int axis, + int start_for_axis) +{ + const auto end_mask = params.end_mask; + const auto shrink_axis_mask = params.shrink_axis_mask; + const auto *stop_indices = params.stop_indices; + const auto *strides = params.strides; + + // Begin with the specified index + const bool shrink_axis = shrink_axis_mask & (1 << axis); + int stop = stop_indices[axis]; + + // When shrinking an axis, the end position does not matter (and can be + // incorrect when negative indexing is used, see Issue #19260). 
Always use
+ // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
+ // already been adjusted for negative indices.
+ if (shrink_axis)
+ {
+ stop = start_for_axis + 1;
+ }
+
+ // end_mask override
+ if (end_mask & (1 << axis))
+ {
+ if (strides[axis] > 0)
+ {
+ // Forward iteration - use the last element. These values will get
+ // clamped below
+ stop = std::numeric_limits<int>::max();
+ }
+ else
+ {
+ // Backward iteration - use the first element.
+ stop = std::numeric_limits<int>::lowest();
+ }
+ }
+
+ // Handle negative indices
+ const int axis_size = input_shape.Dims(axis);
+ if (stop < 0)
+ {
+ stop += axis_size;
+ }
+
+ // Clamping
+ // Because the end index points one past the last element, we need slightly
+ // different clamping ranges depending on the direction.
+ if (strides[axis] > 0)
+ {
+ // Forward iteration
+ stop = Clamp(stop, 0, axis_size);
+ }
+ else
+ {
+ // Backward iteration
+ stop = Clamp(stop, -1, axis_size - 1);
+ }
+
+ return stop;
+}
+
+inline bool LoopCondition(int index, int stop, int stride)
+{
+ // True when we have reached the end of an axis and should loop.
+ return stride > 0 ? index >= stop : index <= stop;
+}
+
+template <typename T>
+inline StridedSliceParams
+buildStridedSliceParams(const T *begin, const T *end, const T *strides, const uint32_t begin_mask,
+ const uint32_t end_mask, const uint32_t shrink_axis_mask,
+ const uint8_t rank)
+{
+ StridedSliceParams op_params;
+ op_params.start_indices_count = rank;
+ op_params.stop_indices_count = rank;
+ op_params.strides_count = rank;
+
+ for (int i = 0; i < rank; ++i)
+ {
+ op_params.start_indices[i] = begin[i];
+ op_params.stop_indices[i] = end[i];
+ op_params.strides[i] = strides[i];
+
+ assert(op_params.strides[i] != 0);
+ }
+
+ op_params.begin_mask = begin_mask;
+ op_params.ellipsis_mask = 0; // NYI
+ op_params.end_mask = end_mask;
+ op_params.new_axis_mask = 0; // NYI
+ op_params.shrink_axis_mask = shrink_axis_mask;
+
+ assert(sizeof(op_params.begin_mask) * 4 >= rank);
+
+ return op_params;
+}
+
+// Defined inline since this lives in a header shared across translation units.
+inline void checkOutputSize(const StridedSliceParams &op_params, const Shape &input_shape,
+ const Shape &output_shape, uint32_t rank)
+{
+ UNUSED_RELEASE(output_shape);
+
+ int32_t shape_size = 0;
+
+ for (uint32_t idx = 0; idx < rank; ++idx)
+ {
+ int32_t stride = op_params.strides[idx];
+ int32_t begin = StartForAxis(op_params, input_shape, idx);
+ int32_t end = StopForAxis(op_params, input_shape, idx, begin);
+
+ // When shrinking an axis, the end position does not matter (and can be
+ // incorrect when negative indexing is used, see Issue #19260). Always use
+ // begin + 1 to generate a length 1 slice, since begin has
+ // already been adjusted for negative indices by StartForAxis.
+ const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx);
+ if (shrink_axis)
+ {
+ end = begin + 1;
+ }
+
+ int32_t dim_shape = std::ceil((end - begin) / static_cast<float>(stride));
+ dim_shape = dim_shape < 0 ?
0 : dim_shape; + if (!shrink_axis) + { + assert(output_shape.Dims(shape_size) == dim_shape); + shape_size++; + } + } + + assert(output_shape.DimensionsCount() == shape_size); +} + +template <typename T> +inline void StridedSlice(const StridedSliceParams &op_params, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + + bool optimize = true; + int st_count = op_params.strides_count; + for (int idx = 0; idx < st_count - 1; idx++) + { + const int axis_size = unextended_input_shape.Dims(idx); + const int start = StartForAxis(op_params, unextended_input_shape, idx); + const int stop = StopForAxis(op_params, unextended_input_shape, idx, start); + if ((axis_size != 1) && (start != 0 || stop != 0)) + { + optimize = false; + break; + } + } + + if (optimize) + { + if (op_params.strides[st_count - 1] == 1) + { + const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1); + const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start); + + for (int idx = 0; idx < end - start; idx++) + { + output_data[idx] = input_data[idx + start]; + } + return; + } + } + + // Note that the output_shape is not used herein. + StridedSliceParams params_copy = op_params; + + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + // Reverse and pad to 4 dimensions because that is what the runtime code + // requires (ie. all shapes must be 4D and are given backwards). + StridedSlicePadIndices(¶ms_copy, 4); + + const int start_b = StartForAxis(params_copy, input_shape, 0); + const int stop_b = StopForAxis(params_copy, input_shape, 0, start_b); + const int start_h = StartForAxis(params_copy, input_shape, 1); + const int stop_h = StopForAxis(params_copy, input_shape, 1, start_h); + const int start_w = StartForAxis(params_copy, input_shape, 2); + const int stop_w = StopForAxis(params_copy, input_shape, 2, start_w); + const int start_d = StartForAxis(params_copy, input_shape, 3); + const int stop_d = StopForAxis(params_copy, input_shape, 3, start_d); + + T *out_ptr = output_data; + for (int in_b = start_b; !LoopCondition(in_b, stop_b, params_copy.strides[0]); + in_b += params_copy.strides[0]) + { + for (int in_h = start_h; !LoopCondition(in_h, stop_h, params_copy.strides[1]); + in_h += params_copy.strides[1]) + { + for (int in_w = start_w; !LoopCondition(in_w, stop_w, params_copy.strides[2]); + in_w += params_copy.strides[2]) + { + for (int in_d = start_d; !LoopCondition(in_d, stop_d, params_copy.strides[3]); + in_d += params_copy.strides[3]) + { + *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w, in_d)]; + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_STRIDEDSLICE_H__ diff --git a/compute/cker/include/cker/operation/Tanh.h b/compute/cker/include/cker/operation/Tanh.h new file mode 100644 index 000000000..8747d52b4 --- /dev/null +++ b/compute/cker/include/cker/operation/Tanh.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TANH_H__ +#define __NNFW_CKER_TANH_H__ + +#include "cker/eigen/Utils.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ + +inline void Tanh(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + auto input_map = MapAsVector(input_data, input_shape); + auto output_map = MapAsVector(output_data, output_shape); + output_map.array() = input_map.array().tanh(); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TANH_H__ diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h new file mode 100644 index 000000000..1dcdd9b79 --- /dev/null +++ b/compute/cker/include/cker/operation/Tile.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_TILE_H__ +#define __NNFW_CKER_TILE_H__ + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T, typename M> +void CopyMultipleTimes(const T *in_data, int32_t in_size, M multiplier, T *out_data) +{ + for (M i = 0; i < multiplier; ++i) + { + const T *in_end = in_data + in_size; + T *new_out_data = std::copy(in_data, in_end, out_data); + in_data = out_data; + out_data = new_out_data; + } +} + +template <typename T, typename M> +std::pair<int, int> TileOneDimension(const Shape &in_dimensions, const T *in_data, + const M *multipliers, T *out_data, int dimension) +{ + const int dimension_size = in_dimensions.Dims(dimension); + if (dimension == in_dimensions.DimensionsCount() - 1) + { + CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data); + return std::make_pair(dimension_size, + dimension_size * static_cast<int>(multipliers[dimension])); + } + int total_stride_size = 0, total_tiled_stride_size = 0; + const T *copy_from_data = in_data; + T *copy_to_data = out_data; + for (int i = 0; i < dimension_size; ++i) + { + int stride_size = 0, tiled_stride_size = 0; + std::tie(stride_size, tiled_stride_size) = + TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); + copy_from_data += stride_size; + copy_to_data += tiled_stride_size; + total_stride_size += stride_size; + total_tiled_stride_size += tiled_stride_size; + } + CopyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1, + out_data + total_tiled_stride_size); + return std::make_pair(total_stride_size, + static_cast<int>(total_tiled_stride_size * multipliers[dimension])); +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TILE_H__ diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h new file mode 100644 index 000000000..9d8cd340d --- /dev/null +++ b/compute/cker/include/cker/operation/Transpose.h @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_TRANSPOSE_H__ +#define __NNFW_CKER_TRANSPOSE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference +{ + +template <typename T> +void TransposeImpl(const TransposeParams ¶ms, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_output_shape, T *output_data) +{ + const int unextended_output_size = unextended_output_shape.DimensionsCount(); + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_size <= 4); + assert(unextended_output_size == params.perm_count); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + const int input_ext_size = 4 - unextended_input_shape.DimensionsCount(); + const int output_ext_size = 4 - unextended_output_size; + + // The perm data is extended to match the output, each index incremented by + // the amount of front padding of the input shape. + int extended_perm[4]; + for (int i = 0; i < output_ext_size; ++i) + { + extended_perm[i] = i; + } + for (int i = 0; i < unextended_output_size; ++i) + { + extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size; + } + + int out_sizes[4]; + // Compute the inverse permutation array so we can do an output centered + // transpose. Also, check to make sure output_dims is matching input_dims. + for (int k = 0; k < 4; k++) + { + out_sizes[k] = MatchingDim(input_shape, extended_perm[k], output_shape, k); + } + + // Naive transpose loop (iterate on output index and compute input index). + int o[4]; // loop index (on output). + int i[4]; + for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) + { + i[extended_perm[3]] = o[3]; + for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) + { + i[extended_perm[2]] = o[2]; + for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) + { + i[extended_perm[1]] = o[1]; + for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) + { + i[extended_perm[0]] = o[0]; + output_data[Offset(output_shape, o)] = input_data[Offset(input_shape, i)]; + } + } + } + } +} + +template <typename T> +void Transpose(const TransposeParams ¶ms, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_output_shape, T *output_data) +{ + // Transpose kernel only does rearranging values not numeric evaluations on + // each cell. It's safe to implement per size of scalar type and this trick + // keeps the total code size in a reasonable range. 
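+ // For example, a float tensor (sizeof(float) == 4) is routed through the
+ // int32_t instantiation below; only raw bit patterns are copied, never
+ // interpreted, so the result is bit-exact for any 4-byte element type.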
+ switch (sizeof(T)) + { + case 1: + TransposeImpl<int8_t>(params, unextended_input_shape, + reinterpret_cast<const int8_t *>(input_data), unextended_output_shape, + reinterpret_cast<int8_t *>(output_data)); + break; + case 2: + TransposeImpl<int16_t>(params, unextended_input_shape, + reinterpret_cast<const int16_t *>(input_data), unextended_output_shape, + reinterpret_cast<int16_t *>(output_data)); + break; + + case 4: + TransposeImpl<int32_t>(params, unextended_input_shape, + reinterpret_cast<const int32_t *>(input_data), unextended_output_shape, + reinterpret_cast<int32_t *>(output_data)); + break; + case 8: + TransposeImpl<int64_t>(params, unextended_input_shape, + reinterpret_cast<const int64_t *>(input_data), unextended_output_shape, + reinterpret_cast<int64_t *>(output_data)); + break; + } +} +} // namespace reference + +namespace +{ + +bool IsTranspose2DApplicable(const TransposeParams ¶ms, const Shape &input_shape, int *dim0, + int *dim1) +{ + const int dims_cnt = input_shape.DimensionsCount(); + + if (dims_cnt == 2) + { + *dim0 = input_shape.Dims(0); + *dim1 = input_shape.Dims(1); + return true; + } + + const int first_perm = params.perm[0]; + for (int i = 1; i < dims_cnt; ++i) + { + int rebased = params.perm[i] - first_perm; + if (rebased < 0) + { + rebased += dims_cnt; + } + if (rebased != i) + { + return false; + } + } + *dim0 = 1; + *dim1 = 1; + for (int i = 0; i < dims_cnt; ++i) + { + if (i < first_perm) + { + *dim0 *= input_shape.Dims(i); + } + else + { + *dim1 *= input_shape.Dims(i); + } + } + return true; +} + +void RemoveOneSizeDimensions(Shape *input_shape, Shape *output_shape, TransposeParams *params) +{ + const int dims_cnt = input_shape->DimensionsCount(); + assert(params->perm_count == dims_cnt); + + bool foundOneSizeDim = false; + for (int i = 0; i < dims_cnt; ++i) + { + if (input_shape->Dims(i) == 1) + { + foundOneSizeDim = true; + break; + } + } + + // Return here if there is no one size dimension. + if (!foundOneSizeDim) + return; + + // Handle the case where all the dimension size is one. + if (input_shape->FlatSize() == 1) + { + input_shape->Resize(1); + input_shape->SetDim(0, 1); + output_shape->Resize(1); + output_shape->SetDim(0, 1); + params->perm_count = 1; + params->perm[0] = 0; + return; + } + + // Resize input shape. + int new_dims_cnt = 0; + for (int i = 0; i < dims_cnt; ++i) + { + if (input_shape->Dims(i) == 1) + { + continue; + } + input_shape->SetDim(new_dims_cnt, input_shape->Dims(i)); + ++new_dims_cnt; + } + input_shape->Resize(new_dims_cnt); + + // Resize output shape and re-calculate the perm parameter. + TransposeParams new_params; + new_dims_cnt = 0; + for (int i = 0; i < dims_cnt; ++i) + { + if (output_shape->Dims(i) == 1) + { + continue; + } + new_params.perm[new_dims_cnt] = params->perm[i]; + output_shape->SetDim(new_dims_cnt, output_shape->Dims(i)); + ++new_dims_cnt; + } + output_shape->Resize(new_dims_cnt); + new_params.perm_count = new_dims_cnt; + + for (int i = 0; i < new_dims_cnt; ++i) + { + int min_val_idx = -1; + for (int j = 0; j < new_dims_cnt; ++j) + { + if (new_params.perm[j] >= i && + (min_val_idx == -1 || new_params.perm[min_val_idx] > new_params.perm[j])) + { + min_val_idx = j; + } + } + new_params.perm[min_val_idx] = i; + } + *params = new_params; +} + +size_t Flatten(const Shape &input_shape, const Shape &output_shape, const TransposeParams ¶ms, + Shape *non_flatten_input_shape, Shape *non_flatten_output_shape, + TransposeParams *non_flatten_params) +{ + // Calculate the total size of non-flatten dimensions. 
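+ // Illustrative example (values assumed here, not taken from any caller):
+ // input shape {8, 4, 16, 16} with perm {0, 1, 3, 2} keeps the leading two
+ // axes in place, so skip_dims_cnt becomes 2 and the returned flat size is
+ // 16 * 16 = 256; the caller then repeats a {16, 16} transpose with perm
+ // {1, 0} over each of the 8 * 4 = 32 leading slices.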
+ int skip_dims_cnt = 0; + size_t flat_size = input_shape.FlatSize(); + for (int i = 0; i < params.perm_count; ++i) + { + if (params.perm[i] == i) + { + flat_size /= input_shape.Dims(i); + ++skip_dims_cnt; + } + else + { + break; + } + } + + // Shrink the shapes and re-calculate the perm parameter. + const int new_dims_cnt = params.perm_count - skip_dims_cnt; + non_flatten_input_shape->Resize(new_dims_cnt); + non_flatten_output_shape->Resize(new_dims_cnt); + non_flatten_params->perm_count = new_dims_cnt; + + for (int i = skip_dims_cnt; i < params.perm_count; ++i) + { + non_flatten_input_shape->SetDim(i - skip_dims_cnt, input_shape.Dims(i)); + non_flatten_output_shape->SetDim(i - skip_dims_cnt, output_shape.Dims(i)); + non_flatten_params->perm[i - skip_dims_cnt] = params.perm[i]; + } + for (int i = 0; i < new_dims_cnt; ++i) + { + int min_val_idx = -1; + for (int j = 0; j < new_dims_cnt; ++j) + { + if (non_flatten_params->perm[j] >= i && + (min_val_idx == -1 || + non_flatten_params->perm[min_val_idx] > non_flatten_params->perm[j])) + { + min_val_idx = j; + } + } + non_flatten_params->perm[min_val_idx] = i; + } + + return flat_size; +} + +} // namespace anonymous (util) + +// Transpose2D only deals with typical 2D matrix transpose ops. +// Perform transpose by transposing 4x4 blocks of the input, proceeding from +// left to right (down the rows) of the input, and then from top to bottom. +template <typename T> +inline void Transpose2D(const Shape &input_shape, const T *input_data, const Shape &output_shape, + T *output_data) +{ + assert(input_shape.DimensionsCount() == 2); + assert(output_shape.DimensionsCount() == 2); + UNUSED_RELEASE(output_shape); + + const int d0 = input_shape.DimsData()[0]; + const int d1 = input_shape.DimsData()[1]; + const int kLines = 4; + const int kSkipSize = (kLines - 1) * d1; + + const T *input = input_data; + + int i = 0; + for (; i <= d0 - kLines; i += kLines) + { + T *output = output_data + i; + + const T *input_ptr = input; + optimized_ops_preload_l1_keep(input_ptr); + input_ptr += d1; + optimized_ops_preload_l1_keep(input_ptr); + input_ptr += d1; + optimized_ops_preload_l1_keep(input_ptr); + input_ptr += d1; + optimized_ops_preload_l1_keep(input_ptr); + + int j = 0; + for (; j <= d1 - kLines; j += kLines) + { + input_ptr = input; + const T a00 = input_ptr[0]; + const T a01 = input_ptr[1]; + const T a02 = input_ptr[2]; + const T a03 = input_ptr[3]; + input_ptr += d1; + const T a10 = input_ptr[0]; + const T a11 = input_ptr[1]; + const T a12 = input_ptr[2]; + const T a13 = input_ptr[3]; + input_ptr += d1; + const T a20 = input_ptr[0]; + const T a21 = input_ptr[1]; + const T a22 = input_ptr[2]; + const T a23 = input_ptr[3]; + input_ptr += d1; + const T a30 = input_ptr[0]; + const T a31 = input_ptr[1]; + const T a32 = input_ptr[2]; + const T a33 = input_ptr[3]; + + output[0] = a00; + output[1] = a10; + output[2] = a20; + output[3] = a30; + output += d0; + + output[0] = a01; + output[1] = a11; + output[2] = a21; + output[3] = a31; + output += d0; + + output[0] = a02; + output[1] = a12; + output[2] = a22; + output[3] = a32; + output += d0; + + output[0] = a03; + output[1] = a13; + output[2] = a23; + output[3] = a33; + output += d0; + + input += kLines; + } + if (j == d1) + { + input += kSkipSize; + } + else + { + for (int p = 0; p < kLines; ++p) + { + for (int q = 0; q < d1 - j; ++q) + { + *(output + q * d0 + p) = *(input + p * d1 + q); + } + } + input += (d1 - j) + kSkipSize; + } + } + for (; i < d0; ++i) + { + T *output = output_data + i; + for (int j = 0; j < d1; 
++j) + { + *output = *input; + output += d0; + ++input; + } + } +} + +// TODO(alanchiao): see if we can reduce the number +// of lines of code in branching without affecting latency. +template <typename T> +inline void Transpose3D(const TransposeParams ¶ms, const Shape &input_shape, + const T *input_data, const Shape &, T *output_data) +{ + int s2, s3; + s2 = input_shape.Dims(1); + s3 = input_shape.Dims(2); + + int p1 = 0; + int p2 = 0; + int p3 = 0; + + if (params.perm[0] == 2) + { + p1 = 1; + } + else if (params.perm[1] == 2) + { + p2 = 1; + } + else + { + p3 = 1; + } + + if (params.perm[0] == 1) + { + p1 = s3; + } + else if (params.perm[1] == 1) + { + p2 = s3; + } + else + { + p3 = s3; + } + + if (params.perm[0] == 0) + { + p1 = s2 * s3; + } + else if (params.perm[1] == 0) + { + p2 = s2 * s3; + } + else + { + p3 = s2 * s3; + } + + int o_s[3]; + o_s[0] = input_shape.Dims(params.perm[0]); + o_s[1] = input_shape.Dims(params.perm[1]); + o_s[2] = input_shape.Dims(params.perm[2]); + + for (int i1 = 0; i1 < o_s[0]; ++i1) + { + for (int i2 = 0; i2 < o_s[1]; ++i2) + { + for (int i3 = 0; i3 < o_s[2]; ++i3) + { + const int i = i1 * p1 + i2 * p2 + i3 * p3; + const int o = i1 * o_s[1] * o_s[2] + i2 * o_s[2] + i3; + output_data[o] = input_data[i]; + } + } + } +} + +template <typename T> +void TransposeImpl(const TransposeParams ¶ms, const Shape &input_shape, const T *input_data, + const Shape &output_shape, T *output_data) +{ + const int dims_cnt = input_shape.DimensionsCount(); + + int dim0, dim1; + if (IsTranspose2DApplicable(params, input_shape, &dim0, &dim1)) + { + Transpose2D(Shape({dim0, dim1}), input_data, Shape({dim1, dim0}), output_data); + return; + } + + // TODO(b/141217325): notably Eigen is better suited for + // larger inputs whereas Transpose3D is generally + // better for smaller ones. + // + // E.g. on Nexus 5, Eigen is better for size 96^3 and up + // and Transpose3D is better for 72^3 and down. + // + // 96^3 is not mobile-friendly for certain usecases + // (e.g. model used in beam search for seq2seq) but is in others. + // Consider tradeoffs. + if (dims_cnt == 3) + { + Transpose3D(params, input_shape, input_data, output_shape, output_data); + return; + } + + // Reroute to the reference version if an optimized method for the given data + // is not available. + reference::Transpose(params, input_shape, input_data, output_shape, output_data); +} + +template <typename T> +void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_input_shape, + const T *input_data, const Shape &unshrunk_output_shape, T *output_data) +{ + const int output_size = unshrunk_output_shape.DimensionsCount(); + assert(unshrunk_input_shape.DimensionsCount() <= 4); + assert(output_size <= 4); + assert(output_size == unshrunk_params.perm_count); + + Shape shrunk_input_shape = Shape(unshrunk_input_shape); + + Shape shrunk_output_shape = Shape(unshrunk_output_shape); + + TransposeParams shrunk_params = unshrunk_params; + + // Reduce any dimensions that have one size. Lower transpose op usually + // performs better since memory access patterns will be improved. + RemoveOneSizeDimensions(&shrunk_input_shape, &shrunk_output_shape, &shrunk_params); + + // Handle identity cases. + // TODO(b/140779653): Add an optimization pass in the conversion process to + // remove transpose op nodes where they do nothing like the below one. 
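+ // After RemoveOneSizeDimensions, an identity permutation (e.g. perm
+ // {0, 1, 2}) means the element order is untouched, so a single memcpy
+ // below suffices.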
+ bool identical = true;
+ for (int i = 0; i < shrunk_params.perm_count; ++i)
+ {
+ if (shrunk_params.perm[i] != i)
+ {
+ identical = false;
+ break;
+ }
+ }
+ if (identical)
+ {
+ memcpy(output_data, input_data, unshrunk_input_shape.FlatSize() * sizeof(T));
+ return;
+ }
+
+ // Reduce dimensions by flattening.
+ if (shrunk_params.perm[0] == 0 && output_size >= 3)
+ {
+ Shape non_flatten_input_shape;
+ Shape non_flatten_output_shape;
+ TransposeParams non_flatten_params;
+ const int total_size = shrunk_input_shape.FlatSize();
+
+ const int non_flatten_size =
+ Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params,
+ &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params);
+ assert(non_flatten_params.perm[0] != 0);
+
+ for (int i = 0; i < total_size; i += non_flatten_size)
+ {
+ TransposeImpl(non_flatten_params, non_flatten_input_shape, input_data + i,
+ non_flatten_output_shape, output_data + i);
+ }
+ return;
+ }
+
+ // Call non-flattened case.
+ TransposeImpl(shrunk_params, shrunk_input_shape, input_data, shrunk_output_shape,
+ output_data);
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRANSPOSE_H__
diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h
new file mode 100644
index 000000000..7db3a1179
--- /dev/null
+++ b/compute/cker/include/cker/operation/TransposeConv.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef __NNFW_CKER_TRANSPOSE_CONV_H__ +#define __NNFW_CKER_TRANSPOSE_CONV_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +inline void TransposeConv(const TransposeConvParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &filter_shape, + const float *filter_data, const Shape &output_shape, float *output_data) +{ + + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + // Although transpose convolution simplifies to convolution with transposed + // weights for strides of 1, non-unitary striding complicates matters. To + // keep this reference implementation as clear as possible, we use a + // "scatter" access pattern, where we loop through all the input elements, + // computing their influence on the output, rather than looping through the + // output elements in the typical "gather" access pattern of a conv. We + // therefore must initialize the output array to zero. + const int num_elements = output_shape.FlatSize(); + for (int i = 0; i < num_elements; i++) + { + output_data[i] = 0.0f; + } + + // Loop through input elements one at a time. 
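+ // Each input element influences at most filter_height * filter_width *
+ // output_depth output cells; elements near the borders touch fewer because
+ // of the bounds check below.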
+ for (int batch = 0; batch < batches; ++batch) + { + for (int in_y = 0; in_y < input_height; ++in_y) + { + for (int in_x = 0; in_x < input_width; ++in_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + // Loop through the output elements it will influence + const int out_x_origin = (in_x * stride_width) - pad_width; + const int out_y_origin = (in_y * stride_height) - pad_height; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + // Compute output element location + const int out_x = out_x_origin + filter_x; + const int out_y = out_y_origin + filter_y; + // We cannot accumulate out of bounds + if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && + (out_y < output_height)) + { + float input_value = + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] += + input_value * filter_value; + } + } + } + } + } + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TRANSPOSE_CONV_H__ diff --git a/compute/cker/include/cker/operation/Unpack.h b/compute/cker/include/cker/operation/Unpack.h new file mode 100644 index 000000000..242aadf46 --- /dev/null +++ b/compute/cker/include/cker/operation/Unpack.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_UNPACK_H__ +#define __NNFW_CKER_UNPACK_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename Scalar> +void Unpack(const UnpackParams ¶ms, const Shape &input_shape, const Scalar *input_data, + const Shape &output_shape, Scalar *const *output_datas) +{ + const int dimensions = input_shape.DimensionsCount(); + const int outputs_count = params.num_split; + + int outer_size = 1; + for (int i = 0; i < params.axis; i++) + { + outer_size *= input_shape.Dims(i); + } + int copy_size = 1; + for (int i = params.axis + 1; i < dimensions; i++) + { + copy_size *= input_shape.Dims(i); + } + assert(output_shape.FlatSize() == copy_size * outer_size); + UNUSED_RELEASE(output_shape); + + for (int i = 0; i < outputs_count; ++i) + { + for (int k = 0; k < outer_size; k++) + { + Scalar *output_ptr = output_datas[i] + copy_size * k; + int loc = k * outputs_count * copy_size + i * copy_size; + memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar)); + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_UNPACK_H__ diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h new file mode 100644 index 000000000..912b01a64 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__ +#define __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__ + +#include <functional> +#include <limits> +#include <utility> +#include "cker/neon/neon_check.h" +#include "cker/operation/reference/BinaryArithmeticOps.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "fixedpoint/fixedpoint.h" + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +template <typename ElementwiseF, typename ScalarBroadcastF, typename T> +inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, bool switch_inputs, + const Shape & /* unswitched_input1_shape */, + const T *unswitched_input1_data, + const Shape & /* unswitched_input2_shape */, + const T *unswitched_input2_data, + const Shape & /* output_shape */, T *output_data, + ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f) +{ + const T *input1_data = switch_inputs ? unswitched_input2_data : unswitched_input1_data; + const T *input2_data = switch_inputs ? unswitched_input1_data : unswitched_input2_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. 
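+ // Illustrative decomposition (shapes assumed for this comment): adding
+ // {2, 3, 4, 1, 8} to {2, 1, 4, 5, 8} gives broadcast_shape
+ // {y0, y1, y2, y3, y4} = {2, 3, 4, 5, 8}; each elementwise_f call handles
+ // y4 = 8 elements, each input1 chunk is re-read y3 = 5 times, and the
+ // input2 block is re-read y1 = 3 times.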
+ T *output_data_ptr = output_data; + const T *input1_data_ptr = input1_data; + const T *input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for input 2. + // Put another way, + // input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) + { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. + for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + for (int i3 = 0; i3 < y3; ++i3) + { + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } + else + { + // Special case of y4 == 1, in which the innermost loop is a single element + // and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except simplified + // for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. 
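+ // For instance (shapes assumed for illustration): a {4, 1} input added to a
+ // {4, 3} input decomposes to y2 = 4, y3 = 3, y4 = 1, and scalar_broadcast_f
+ // spreads each input1 scalar over one y3-length run of input2.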
+ for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + +inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, + const uint8_t input2_data) +{ + const int32_t input1_val = params.input1_offset + input1_data; + const int32_t input2_val = params.input2_offset + input2_data; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + return clamped_output; +} + +inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + uint8_t *output_data) +{ + int i = 0; + +#ifdef USE_NEON + const uint8x8_t output_activation_min_vector = vdup_n_u8(params.quantized_activation_min); + const uint8x8_t output_activation_max_vector = vdup_n_u8(params.quantized_activation_max); + for (; i <= size - 8; i += 8) + { + const uint8x8_t input1_val_original = vld1_u8(input1_data + i); + const uint8x8_t input2_val_original = vld1_u8(input2_data + i); + const int16x8_t input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val_original)); + const int16x8_t input2_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); + const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); + const int16x4_t input1_val_high = vget_high_s16(input1_val); + const int16x4_t input1_val_low = vget_low_s16(input1_val); + const int16x4_t input2_val_high = vget_high_s16(input2_val); + const int16x4_t input2_val_low = vget_low_s16(input2_val); + int32x4_t x11 = vmovl_s16(input1_val_low); + int32x4_t x12 = vmovl_s16(input1_val_high); + int32x4_t x21 = vmovl_s16(input2_val_low); + int32x4_t x22 = vmovl_s16(input2_val_high); + const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); + const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); + x21 = vshlq_s32(x21, 
input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + int32x4_t s1 = vaddq_s32(x11, x21); + int32x4_t s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); + const int16x4_t s1_narrowed = vmovn_s32(s1); + const int16x4_t s2_narrowed = vmovn_s32(s2); + const int16x8_t s = + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const uint8x8_t clamped = vmax_u8(output_activation_min_vector, + vmin_u8(output_activation_max_vector, vqmovun_s16(s))); + vst1_u8(output_data + i, clamped); + } +#endif // NEON + for (; i < size; ++i) + { + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<uint8_t>(clamped_output); + } +} + +struct BinaryOpFuncAddFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vaddq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a + b; } +}; + +struct BinaryOpFuncSubFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vsubq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a - b; } +}; + +struct BinaryOpFuncMulFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vmulq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a * b; } +}; + +struct BinaryOpFuncDivFloat +{ +#ifdef USE_NEON +#ifdef __aarch64__ + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vdivq_f32(a, b); + } +#endif // __aarch64__ +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a / b; } +}; + +template <class BASEOPERATOR> struct BinaryOpFuncSwapArgs +{ + template <typename T> static inline T calculate(const T &a, const T &b) + { + return BASEOPERATOR::calculate(b, a); + } +}; + +struct BinaryOpActivationFloatNone +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + (void)ceilingParam; // suppress unused argument warning + return value; + } + static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + (void)floorParam; + return value; + } +#endif // USE_NEON + static 
inline float applyCeiling(const float value, const float ceilingParam) + { + (void)ceilingParam; + return value; + } + static inline float applyFloor(const float value, const float floorParam) + { + (void)floorParam; + return value; + } +}; + +struct BinaryOpActivationFloatMax +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + (void)ceilingParam; // suppress unused argument warning + return value; + } + static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + return vmaxq_f32(value, floorParam); + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + (void)ceilingParam; + return value; + } + static inline float applyFloor(const float value, const float floorParam) + { + return std::max(value, floorParam); + } +}; + +struct BinaryOpActivationFloatMinMax +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + return vminq_f32(value, ceilingParam); + } + static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + return vmaxq_f32(value, floorParam); + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + return std::min(value, ceilingParam); + } + static inline float applyFloor(const float value, const float floorParam) + { + return std::max(value, floorParam); + } +}; + +template <class OPERATOR, class ACTIVATION> +inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const float *input1_data, const float *input2_data, + float *output_data) +{ + int i = 0; + +#ifdef USE_NEON + const auto activation_min = vdupq_n_f32(params.float_activation_min); + const auto activation_max = vdupq_n_f32(params.float_activation_max); + for (; i <= size - 16; i += 16) + { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = OPERATOR::calculate(a10, a20); + auto x1 = OPERATOR::calculate(a11, a21); + auto x2 = OPERATOR::calculate(a12, a22); + auto x3 = OPERATOR::calculate(a13, a23); + x0 = ACTIVATION::applyFloor(x0, activation_min); + x1 = ACTIVATION::applyFloor(x1, activation_min); + x2 = ACTIVATION::applyFloor(x2, activation_min); + x3 = ACTIVATION::applyFloor(x3, activation_min); + x0 = ACTIVATION::applyCeiling(x0, activation_max); + x1 = ACTIVATION::applyCeiling(x1, activation_max); + x2 = ACTIVATION::applyCeiling(x2, activation_max); + x3 = ACTIVATION::applyCeiling(x3, activation_max); + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) + { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = OPERATOR::calculate(a1, a2); // vaddq + auto x_clamped = + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + vst1q_f32(output_data + i, x_clamped); + } +#endif // USE_NEON + for (; i < size; i++) + { + auto x = OPERATOR::calculate(input1_data[i], input2_data[i]); + output_data[i] = ACTIVATION::applyCeiling( + ACTIVATION::applyFloor(x, 
params.float_activation_min), params.float_activation_max); + } +} + +// Broadcast binary op template that can often be used for inner loop +// This function will handle scalar_value (LHS) and vector_values (RHS). +// Since it's a float function, input params does not matter here. +template <class OPERATOR, class ACTIVATION> +inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const float broadcast_value, const float *input2_data, + float *output_data) +{ + int i = 0; + +#ifdef USE_NEON + const auto activation_min = vdupq_n_f32(params.float_activation_min); + const auto activation_max = vdupq_n_f32(params.float_activation_max); + const auto broadcast_value_dup = vdupq_n_f32(broadcast_value); + for (; i <= size - 16; i += 16) + { + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = OPERATOR::calculate(broadcast_value_dup, a20); + auto x1 = OPERATOR::calculate(broadcast_value_dup, a21); + auto x2 = OPERATOR::calculate(broadcast_value_dup, a22); + auto x3 = OPERATOR::calculate(broadcast_value_dup, a23); + x0 = ACTIVATION::applyFloor(x0, activation_min); + x1 = ACTIVATION::applyFloor(x1, activation_min); + x2 = ACTIVATION::applyFloor(x2, activation_min); + x3 = ACTIVATION::applyFloor(x3, activation_min); + x0 = ACTIVATION::applyCeiling(x0, activation_max); + x1 = ACTIVATION::applyCeiling(x1, activation_max); + x2 = ACTIVATION::applyCeiling(x2, activation_max); + x3 = ACTIVATION::applyCeiling(x3, activation_max); + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) + { + auto a2 = vld1q_f32(input2_data + i); + auto x = OPERATOR::calculate(broadcast_value_dup, a2); + auto x_clamped = + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + vst1q_f32(output_data + i, x_clamped); + } +#endif // USE_NEON + for (; i < size; i++) + { + auto x = OPERATOR::calculate(broadcast_value, input2_data[i]); + output_data[i] = ACTIVATION::applyCeiling( + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + } +} + +using BinaryOpImplFloatFuncs = + std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), + void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>; + +template <class FUNC> +inline BinaryOpImplFloatFuncs +getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam ¶ms) +{ + if (params.float_activation_max == std::numeric_limits<float>::max()) + if (params.float_activation_min == std::numeric_limits<float>::lowest()) + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatNone>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatNone>); + else + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMax>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMax>); + else + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMinMax>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>); +} + +inline void AddQuant8(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const uint8_t *input1_data, const Shape &input2_shape, + const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +{ + const int flat_size = MatchingElementsSize(input1_shape, 
input2_shape, output_shape); + AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); +} + +inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, const float *input2_data, + const Shape &output_shape, float *output_data) +{ + const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); +} + +// Scalar-broadcast add that can be used for inner loop of more general +// broadcast add, so that, for example, scalar-broadcast with batch will still +// be fast. +inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, + uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) +{ + int i = 0; + int32_t clamped_output; + for (; i < size; ++i) + { + clamped_output = quant8_sum(params, broadcast_value, input2_data[i]); + output_data[i] = static_cast<uint8_t>(clamped_output); + } +} + +inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, + const Shape &input1_shape, const uint8_t *input1_data, + const Shape &input2_shape, const uint8_t *input2_data, + const Shape &output_shape, uint8_t *output_data) +{ + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) + { + const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> + fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, + const uint8_t &b) -> uint8_t { + return static_cast<uint8_t>(quant8_sum(params, a, b)); + }; + reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, + input2_shape, input2_data, output_shape, + output_data, fn); + } + else + { + BinaryBroadcastFiveFold( + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, + uint8_t *)>(AddElementwiseQuant8), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, + uint8_t *)>(AddScalarBroadcastQuant8)); + } +} + +inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data) +{ + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) + { + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a + b; }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + } + else + { + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); + + BinaryBroadcastFiveFold(params, params.broadcast_category == + BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data, implFuncs.first, implFuncs.second); + } +} + +inline void Sub(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, const float *input2_data, + const Shape &output_shape, float *output_data) +{ + const int flat_size = MatchingElementsSize(input1_shape, 
input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastSubDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
+ {
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+ BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
+ {
+ auto implFuncs =
+ getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params);
+ BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else
+ {
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a - b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ }
+}
+
+inline int32_t quant8_mul(const BinaryArithmeticOpParam &params, const uint8_t input1_data,
+ const uint8_t input2_data)
+{
+ const int32_t input1_val = params.input1_offset + input1_data;
+ const int32_t input2_val = params.input2_offset + input2_data;
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
+
+ return clamped_output;
+}
+
+inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t *input1_data, const uint8_t *input2_data,
+ uint8_t *output_data)
+{
+ int i = 0;
+
+#ifdef USE_NEON
+ const auto input1_offset_vector = vdupq_n_s16(params.input1_offset);
+ const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
+ const auto output_offset_vector = vdupq_n_s16(params.output_offset);
+ const auto output_activation_min_vector = vdup_n_u8(params.quantized_activation_min);
+ const auto output_activation_max_vector = vdup_n_u8(params.quantized_activation_max);
+ const int left_shift = std::max(0, params.output_shift);
+ const int right_shift = std::max(0, -params.output_shift);
+ const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
+ for (; i <= size - 8; i += 8)
+ {
+ // We load / store 8 at a time, multiplying as two sets of 4 int32s.
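+ // Illustrative note (not part of the upstream source): the NEON body below
+ // mirrors the scalar quant8_mul above. MultiplyByQuantizedMultiplier(x, qm,
+ // shift) is RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
+ // x * (1 << left_shift), qm), right_shift); vqrdmulhq_n_s32 supplies the
+ // saturating rounding doubling high multiply, and RoundingDivideByPOT the
+ // rounded shift. E.g. with output_shift == -7, left_shift is 0 and
+ // right_shift is 7, so the 32-bit product is divided by 2^7 after the
+ // fixed-point multiply.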
+ const auto input1_val_original = vld1_u8(input1_data + i);
+ const auto input2_val_original = vld1_u8(input2_data + i);
+ const auto input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+ const auto input2_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+ const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector);
+ const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);
+
+ const auto input1_val_low = vget_low_s16(input1_val);
+ const auto input1_val_high = vget_high_s16(input1_val);
+ const auto input2_val_low = vget_low_s16(input2_val);
+ const auto input2_val_high = vget_high_s16(input2_val);
+
+ auto p1 = vmull_s16(input2_val_low, input1_val_low);
+ auto p2 = vmull_s16(input2_val_high, input1_val_high);
+
+ p1 = vshlq_s32(p1, left_shift_vec);
+ p2 = vshlq_s32(p2, left_shift_vec);
+ p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+ p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ p1 = RoundingDivideByPOT(p1, right_shift);
+ p2 = RoundingDivideByPOT(p2, right_shift);
+
+ const auto p1_narrowed = vqmovn_s32(p1);
+ const auto p2_narrowed = vqmovn_s32(p2);
+ const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
+ const auto clamped = vmax_u8(output_activation_min_vector,
+ vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
+ vst1_u8(output_data + i, clamped);
+ }
+#endif // USE_NEON
+
+ for (; i < size; ++i)
+ {
+ const int32_t input1_val = params.input1_offset + input1_data[i];
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output =
+ std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
+
+inline void MulQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const uint8_t *input1_data, const Shape &input2_shape,
+ const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
+{
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t broadcast_value, const uint8_t *input2_data,
+ uint8_t *output_data)
+{
+ int i = 0;
+ int32_t clamped_output;
+ for (; i < size; ++i)
+ {
+ clamped_output = quant8_mul(params, broadcast_value, input2_data[i]);
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
+
+inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam &params,
+ const Shape &input1_shape, const uint8_t *input1_data,
+ const Shape &input2_shape, const uint8_t *input2_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ if (params.broadcast_category ==
BroadcastableOpCategory::kGenericBroadcast)
+ {
+ const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)>
+ fn = [](const BinaryArithmeticOpParam &params, const uint8_t &a,
+ const uint8_t &b) -> uint8_t {
+ return static_cast<uint8_t>(quant8_mul(params, a, b));
+ };
+ reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data,
+ input2_shape, input2_data, output_shape,
+ output_data, fn);
+ return;
+ }
+ BinaryBroadcastFiveFold(
+ params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
+ input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
+ uint8_t *)>(MulElementwiseQuant8),
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
+ uint8_t *)>(MulSimpleBroadcastQuant8));
+}
+
+inline void BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
+ {
+ // TODO: Use GetBinaryArithmeticFn
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a * b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ return;
+ }
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+ BinaryBroadcastFiveFold(params, params.broadcast_category ==
+ BroadcastableOpCategory::kSecondInputBroadcastsFast,
+ input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data, implFuncs.first, implFuncs.second);
+}
+
+inline void Div(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, float *output_data)
+{
+#ifdef __aarch64__
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+#else
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a / b; };
+ reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, fn);
+#endif // __aarch64__
+}
+
+inline void BroadcastDivDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+#ifdef __aarch64__
+ if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
+ {
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+ BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
+ {
+ auto implFuncs =
+ getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params);
+ BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape,
input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); + } + else +#endif // __aarch64__ + { + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a / b; }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + } +} + +} // namespace optimized +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__ diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h new file mode 100644 index 000000000..0f620146c --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/Conv.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_CONV_H__ +#define __NNFW_CKER_OPTIMIZED_CONV_H__ + +#include "OptimizedUtils.h" + +#include "cker/eigen/EigenSupport.h" +#include "cker/eigen/Utils.h" +#include "cker/gemmlowp/GEMMSupport.h" +#include "cker/neon/neon_check.h" +#include "cker/operation/Common.h" +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <public/gemmlowp.h> +#include <public/map.h> +#include <fixedpoint/fixedpoint.h> + +#include <vector> +#include <tuple> + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +struct GemmlowpOutputPipeline +{ + typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap; + typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, + gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> + Pipeline; + static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset, + int32_t output_multiplier, int output_left_shift, + int32_t output_activation_min, int32_t output_activation_max) + { + ColVectorMap bias_vector(bias_data, output_rows); + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; + bias_addition_stage.bias_vector = bias_vector; + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; + quantize_down_stage.result_offset_after_shift = output_offset; + quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; + quantize_down_stage.result_exponent = output_left_shift; + gemmlowp::OutputStageClamp clamp_stage; + clamp_stage.min = output_activation_min; + clamp_stage.max = output_activation_max; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, + saturating_cast_stage); + } +}; + +inline void AddBiasAndEvalActivationFunction(float output_activation_min, + float output_activation_max, const Shape &bias_shape, + const float *bias_data, const Shape &array_shape, + float *array_data) +{ + 
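+ // Illustrative note (not part of the upstream source): BiasAndClamp adds the
+ // bias vector cyclically over the flattened array and clamps the sum into the
+ // activation range, roughly:
+ //   for (int i = 0; i < array_size; i += bias_size)
+ //     for (int j = 0; j < bias_size; ++j)
+ //       array_data[i + j] = std::min(clamp_max,
+ //           std::max(clamp_min, array_data[i + j] + bias_data[j]));
+ // For an NHWC conv output this applies the per-channel bias at every pixel.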
BiasAndClamp(output_activation_min, output_activation_max, bias_shape.FlatSize(), bias_data,
+ array_shape.FlatSize(), array_data);
+}
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data,
+ const Shape &im2col_shape, uint8_t *im2col_data)
+{
+ gemmlowp::GemmContext *gemm_context = gemm_support::GetGemmLowpContext();
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int32_t input_offset = params.input_offset;
+ const int32_t filter_offset = params.weights_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_multiplier = params.output_multiplier;
+ const int output_shift = params.output_shift;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const uint8_t *gemm_input_data = nullptr;
+ const Shape *gemm_input_shape = nullptr;
+ const int filter_width = filter_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
+ const bool need_im2col =
+ stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
+ if (need_dilated_im2col)
+ {
+ assert(im2col_data);
+ const int input_zero_point = -input_offset;
+ assert(input_zero_point >= 0);
+ assert(input_zero_point <= 255);
+ DilatedIm2col(params, input_zero_point, input_shape, input_data, filter_shape, output_shape,
+ im2col_data);
+ gemm_input_data = im2col_data;
+ gemm_input_shape = &im2col_shape;
+ }
+ else if (need_im2col)
+ {
+ assert(im2col_data);
+ const int input_zero_point = -input_offset;
+ assert(input_zero_point >= 0);
+ assert(input_zero_point <= 255);
+ Im2col(params, filter_height, filter_width, input_zero_point, input_shape, input_data,
+ im2col_shape, im2col_data);
+ gemm_input_data = im2col_data;
+ gemm_input_shape = &im2col_shape;
+ }
+ else
+ {
+ gemm_input_data = input_data;
+ gemm_input_shape = &input_shape;
+ }
+
+ const int gemm_input_rows = gemm_input_shape->Dims(3);
+ // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784).
+ // The root cause has not yet been identified though. Same applies below for
+ // the other calls commented out. This is a partial rollback of cl/196819423.
+ // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
+ const int gemm_input_cols =
+ gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2);
+ const int filter_rows = filter_shape.Dims(0);
+ // See b/79927784.
+ // const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
+ const int filter_cols = filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3);
+ const int output_rows = output_shape.Dims(3);
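+ // Illustrative shape example (not part of the upstream source): for an NHWC
+ // input [1, 8, 8, 16], filter [32, 3, 3, 16] and output [1, 8, 8, 32], the
+ // quantized conv becomes a single GEMM over the im2col buffer with
+ //   filter_matrix : 32 x 144  (filter_rows x filter_cols, row-major)
+ //   input_matrix  : 144 x 64  (gemm_input_rows x gemm_input_cols, col-major)
+ //   output_matrix : 32 x 64   (output_rows x output_cols, col-major)
+ // so output_rows is the output channel count and output_cols is
+ // batches * output_height * output_width.
+ // See b/79927784.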
+ // const int output_cols = FlatSizeSkipDim(output_shape, 3); + const int output_cols = output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2); + assert(output_rows == filter_rows); + assert(output_cols == gemm_input_cols); + assert(filter_cols == gemm_input_rows); + assert(bias_shape.FlatSize() == output_rows); + UNUSED_RELEASE(bias_shape); + gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, filter_rows, filter_cols); + gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix( + gemm_input_data, gemm_input_rows, gemm_input_cols); + gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows, + output_cols); + const auto &output_pipeline = + GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, + output_pipeline); +} + +} // namespace optimized + +namespace multithreaded +{ +namespace +{ +template <class T> class EigenTensorConvFunctor +{ +private: + Eigen::PaddingType RuntimePadding2EigenPadding(PaddingType padding) + { + switch (padding) + { + case PaddingType::kValid: + return Eigen::PADDING_VALID; + case PaddingType::kSame: + return Eigen::PADDING_SAME; + case PaddingType::kNone: + assert(false); // should never get here. + return Eigen::PADDING_VALID; + } + return Eigen::PADDING_SAME; // Prevent compiler warning about missing + // return + } + +public: + void operator()(const Eigen::ThreadPoolDevice &device, const T *input_data, int input_batches, + int input_height, int input_width, int input_depth, const T *filter_data, + int filter_height, int filter_width, int filter_count, int stride_rows, + int stride_cols, int pad_height, int pad_width, nnfw::cker::PaddingType padding, + T *output_data, int output_height, int output_width) + { + const bool is_1x1_kernel = + (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); + const bool is_same_height_width = + (filter_height == input_height && filter_width == input_width && pad_width == 0 && + pad_height == 0); + if (is_1x1_kernel || is_same_height_width) + { + // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication. + // - output (input_batches * conv_width, filter_count) + // - input (input_batches * conv_width, input_depth) + // - filter (input_depth, filter_count) + // is_same_height_width: If the input data and filter have the same height/width, the 2D + // convolution is reduced to matrix multiplication. 
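+ // Illustrative example (not part of the upstream source): for a 1x1 kernel
+ // with input (2, 5, 5, 8), stride 1 and 16 filters, io_col below becomes
+ // 2 * 5 * 5 = 50 and the contraction is a (50 x 8) * (8 x 16) matmul.
+ // For the same-height/width case the operands collapse to: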
+ // - output (input_batches, filter_count)
+ // - input (input_batches, filter_width * filter_height * input_depth)
+ // - filter (filter_width * filter_height * input_depth, filter_count)
+ const int conv_width = output_height * output_width;
+ int io_col = input_batches;
+ int filter_col = input_depth * filter_width * filter_height;
+ if (is_1x1_kernel)
+ {
+ io_col *= conv_width;
+ }
+ Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+ dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+ eigen_support::EigenMatrix output(output_data, io_col, filter_count);
+ eigen_support::ConstEigenMatrix input(input_data, io_col, filter_col);
+ eigen_support::ConstEigenMatrix filter(filter_data, filter_col, filter_count);
+ eigen_support::MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input, filter,
+ dim_pair);
+ }
+ else
+ {
+ eigen_support::EigenTensor output(output_data, input_batches, output_height, output_width,
+ filter_count);
+ eigen_support::ConstEigenTensor input(input_data, input_batches, input_height, input_width,
+ input_depth);
+ eigen_support::ConstEigenTensor filter(filter_data, filter_height, filter_width, input_depth,
+ filter_count);
+ output.device(device) = Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
+ RuntimePadding2EigenPadding(padding));
+ }
+ }
+};
+} // namespace
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data)
+{
+ const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const PaddingType padding = params.padding_type;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ EigenTensorConvFunctor<float> conv_functor;
+ conv_functor(device, input_data, batches, input_height, input_width, input_depth, filter_data,
+ filter_height, filter_width, output_depth, stride_height, stride_width, pad_height,
+ pad_width, padding, output_data, output_height, output_width);
+
+ optimized::AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
+ bias_shape, bias_data, output_shape, output_data);
+}
+
+} // namespace multithreaded
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_CONV_H__
diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h
new file mode 100644
index 000000000..d383b126d
--- /dev/null
+++ 
b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h @@ -0,0 +1,2123 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" + +#include <fixedpoint/fixedpoint.h> +#include <public/gemmlowp.h> + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +// Implementation of quantized DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct QuantizedDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON +template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + uint8x8x2_t filter_u8; + filter_u8.val[0] = vld1_u8(filter_ptr); + filter_u8.val[1] = vld1_u8(filter_ptr + 8); + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Load the inputs, add input_offset. 
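+ // Illustrative note (not part of the upstream source): this specialization
+ // handles input_depth == 8 with depth_multiplier == 2, so every output pixel
+ // carries 16 output channels and the 16 int32 accumulators live in the four
+ // int32x4 vectors acc[0..1].val[0..1]. The vzipq_s16 below interleaves the
+ // input with itself so input channel c feeds both outputs 2*c and 2*c + 1.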
+ const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += input_ptr_increment; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[0].val[i] = + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + acc[1].val[i] = + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) + { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + acc[0] = vld1q_s32(acc_buffer_ptr); + acc[1] = vld1q_s32(acc_buffer_ptr + 4); + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Multiply-accumulate. 
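+ // Illustrative note (not part of the upstream source): vmlal_s16 is a
+ // widening multiply-accumulate; it multiplies two int16x4 halves and adds
+ // the int32x4 products into the accumulators, so all eight channels of this
+ // pixel are updated in two instructions with no 16-bit overflow risk.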
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc[0]); + vst1q_s32(acc_buffer_ptr + 4, acc[1]); + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4x2_t input_dup2 = vzip_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. 
+ int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + int outp = 0; + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. 
+ uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input_dup2); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); + const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. 
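+ // Illustrative note (not part of the upstream source): with input_depth == 2
+ // and depth_multiplier == 1 the two filter taps were replicated above as
+ // {f0, f1, f0, f1}, so a single int16x4 widening multiply covers two output
+ // pixels (four channel values) at once; the loops below consume 8, 4, 2 and
+ // finally 1 pixel(s) per iteration as the remainder shrinks.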
+ uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) + { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. 
+ vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); + const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x2_t acc = vld1_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. 
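+ // Illustrative note (not part of the upstream source): here input_depth == 1
+ // and depth_multiplier == 4, so every input byte produces four adjacent
+ // output channels. vmlal_lane_s16 broadcasts one input lane against the four
+ // filter taps, so in the block below acc[k] accumulates filter times input
+ // lane k, i.e. one accumulator per output pixel of the 8-pixel block.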
+ for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); + acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); + acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); + acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vmlal_n_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. 
+ uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i); + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + } + input_ptr += 16; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters, add filter_offset. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. 
+ uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // We will have to duplicate bytes in a NEON register, 3-fold. + // We will do that by register-level table-look-up using VTBL instructions. + // Here we prepare the registers containing the table-lookup indices. + static const uint8_t dup3_indices_array[3][8] = { + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + uint8x8_t dup3_indices[3]; + for (int i = 0; i < 3; i++) + { + dup3_indices[i] = vld1_u8(dup3_indices_array[i]); + } + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const uint8_t *local_filter_ptr = filter_ptr; + const uint8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters, add filter_offset. 
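+ // Illustrative note (not part of the upstream source): kFixedInputDepth == 0
+ // means input_depth is only known at runtime. For depth_multiplier == 3 every
+ // input byte has to appear three times in a row, and the dup3_indices tables
+ // above let vtbl1_u8 expand 8 input bytes into the 24-byte pattern
+ // {i0, i0, i0, i1, i1, i1, i2, i2, i2, ...} purely with register-level table
+ // lookups instead of scalar shuffling.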
+ int16x8_t filter[3]; + uint8x8x3_t filter_u8; + filter_u8.val[0] = vld1_u8(local_filter_ptr); + filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); + filter_u8.val[2] = vld1_u8(local_filter_ptr + 16); + local_filter_ptr += 24; + for (int i = 0; i < 3; i++) + { + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + // Load the inputs, duplicate 3-fold, add input_offset. + const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + + uint8x8_t input_u8_dup3[3]; + for (int i = 0; i < 3; i++) + { + input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]); + } + int16x8_t input_dup3[3]; + for (int i = 0; i < 3; i++) + { + const int16x8_t input_s16_dup3 = vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i])); + input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4x3_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); + } + // Multiply-accumulate + for (int j = 0; j < 3; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); + } + acc_buffer_ptr += 24; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const uint16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 3; i++) + { + const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + local_filter_ptr += 3; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const uint8_t *local_filter_ptr = filter_ptr; + const uint8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters, add filter_offset. + int16x8_t filter[2]; + uint8x8x2_t filter_u8; + filter_u8.val[0] = vld1_u8(local_filter_ptr); + filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); + local_filter_ptr += 16; + for (int i = 0; i < 2; i++) + { + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + // Load the inputs, add input_offset, duplicate 2-fold. + const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Load the accumulators from acc_buffer. 
+ int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Multiply-accumulate. + for (int j = 0; j < 2; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + } + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the inputs. + const uint16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 2; i++) + { + const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + local_filter_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const uint8_t *local_filter_ptr = filter_ptr; + const uint8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters, add filter_offset. + uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0); + uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1); + local_filter_ptr += 16; + int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); + int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); + filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset)); + filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset)); + // Load the inputs, add input_offset. + uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0); + uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1); + local_input_ptr += 16; + int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0)); + int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1)); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 8 input channels at a time. 
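+ // As in the 16-channel block above, vmlal_s16 is a widening multiply-
+ // accumulate: per lane, acc[k] += (int32_t)input[k] * (int32_t)filter[k],
+ // with each int16x4 half of a vector feeding one int32x4 accumulator.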
+ for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr); + local_filter_ptr += 8; + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const uint16_t input_val = *local_input_ptr++ + input_offset; + const uint16_t filter_val = *local_filter_ptr++ + filter_offset; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) + { + filter_u8[i] = vld1_u8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) + { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. 
+ uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) + { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += input_ptr_increment; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) + { + filter_u8[i] = vld1_u8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) + { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. 
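+ // With input_depth == 1 and depth_multiplier == 16, each output pixel
+ // consumes a single input byte, multiplied against all 16 filter taps.
+ // vmlal_n_s16 broadcasts that scalar: per lane (illustrative indexing),
+ // acc[m] += (int32_t)filter[m] * (int32_t)input.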
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + uint8_t input_u8 = *input_ptr; + input_ptr += input_ptr_increment; + uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); + acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0); + uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1); + uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2); + uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3); + int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); + int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); + int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2)); + int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3)); + filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset)); + filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset)); + filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset)); + filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset)); + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ uint8_t input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+ int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+ int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+ acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+ acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+ acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+ vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+ vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+ acc_buffer_ptr += 32;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ // Load the filters, add filter_offset.
+ // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+ // We load the first 16 bytes into filter_u8_{0,1} as usual.
+ // Then we load the 8 last bytes into filter_u8_x (x for 'extra').
+ // This is redundant: the first 4 bytes of filter_u8_x are the same
+ // as the last 4 bytes of filter_u8_1.
+ uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+ uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+ uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
+ int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
+ filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+ filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+ filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
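+ // Byte layout of the loads above: filter_0 holds taps 0..7, filter_1
+ // taps 8..15, and filter_x taps 12..19. Only the high half of filter_x
+ // (taps 16..19) is consumed below; its low half duplicates taps 12..15.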
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + uint8_t input_u8 = *input_ptr; + input_ptr += input_ptr_increment; + uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input); + acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input); + acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input); + acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input); + acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter = + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + uint8_t input_u8 = *input_ptr; + input_ptr += input_ptr_increment; + uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input); + acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); + const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. 
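+ // Two pixels of depth 2 are packed into one int16x4 as
+ // [p0c0 p0c1 p1c0 p1c1]; since the filter was duplicated above as
+ // [f0 f1 f0 f1], a single vmlal_s16 accumulates both pixels at once.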
+ uint16x4_t input_u16 = vdup_n_u16(0);
+ input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 0);
+ input_ptr += input_ptr_increment;
+ input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 =
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vdup_n_u8(0);
+ input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+ input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ if (num_output_pixels <= 0)
+ {
+ return;
+ }
+
+ // Load the filters, add filter_offset.
+ uint8x8_t filter_u8 = vdup_n_u8(0);
+ filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+ filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+ filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+ filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+ const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+ const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+ int outp = 0;
+
+ // Handle one output pixel at a time until the second-to-last pixel. We
+ // stop there because the eight-byte vector load below reads four bytes
+ // past the four channels actually consumed, which must not happen on
+ // the last pixel.
+ for (; outp < num_output_pixels - 1; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ uint8x8_t input_u8 = vld1_u8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle the last output pixel.
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
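+ // The last pixel is assembled lane by lane rather than with vld1_u8:
+ // an 8-byte vector load here could read 4 bytes past the end of the
+ // input buffer, so only the 4 valid bytes are touched.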
+ uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + // Load the filters, add filter_offset. + uint8x8_t filter_u8_0 = vld1_u8(filter_ptr); + uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4); + int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); + int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); + filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset)); + filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset)); + int16x4_t filter_0 = vget_low_s16(filter_s16_0); + int16x4_t filter_1 = vget_high_s16(filter_s16_0); + int16x4_t filter_2 = vget_high_s16(filter_s16_1); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + uint8x8_t input_u8_0 = vld1_u8(input_ptr); + uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4); + input_ptr += input_ptr_increment; + int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0)); + int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1)); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + + // Multiply-accumulate + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); + acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); + + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + + acc_buffer_ptr += 12; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, + int input_width, const uint8_t *input_data, + int16_t input_offset, int pad_width, int depth_multiplier, + int filter_width, const uint8_t *filter_data, + int16_t filter_offset, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32_t *acc_buffer) +{ + // Sanity check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. 
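+ // A fixed template parameter of 0 means "variable at runtime"; e.g.
+ // (hypothetical instantiations) QuantizedDepthwiseConvAccumRow<true, 0, 3>
+ // accepts any input_depth but is compiled for depth_multiplier == 3,
+ // while QuantizedDepthwiseConvAccumRow<false, 8, 1> additionally requires
+ // stride == 1 and input_depth == 8.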
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth,
+ "a fixed input depth requires a fixed depth multiplier");
+ static_assert(kFixedInputDepth || kAllowStrided,
+ "a variable input depth requires the strided kernel path");
+ assert(stride == 1 || kAllowStrided);
+ if (kFixedInputDepth)
+ {
+ assert(input_depth == kFixedInputDepth);
+ }
+ if (kFixedDepthMultiplier)
+ {
+ assert(depth_multiplier == kFixedDepthMultiplier);
+ }
+ assert(output_depth == input_depth * depth_multiplier);
+ const int input_ptr_increment = stride * input_depth;
+ const uint8_t *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ // For the current (filter_x, filter_y) point in the filter,
+ // compute the boundaries of the corresponding output row segment.
+ int out_x_loop_start_unclamped = 0;
+ int out_x_loop_end_unclamped = 0;
+ if (kAllowStrided)
+ {
+ if (stride == 2)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
+ }
+ else if (stride == 4)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
+ }
+ else
+ {
+ out_x_loop_start_unclamped =
+ (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+ out_x_loop_end_unclamped =
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
+ }
+ }
+ else
+ {
+ out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+ out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
+ }
+ // The kernel will have to iterate on the segment of the
+ // output row that starts at out_x_loop_start and ends at out_x_loop_end.
+ const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
+ const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
+
+ int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
+ const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+ QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
+ num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
+ input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Generic fallback of DepthwiseConvAccumRow, portable, non-templatized.
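+ // The out_x loop bounds below keep the sampled input coordinate
+ // in_x = out_x * stride - pad_width + dilation_factor * filter_x inside
+ // [0, input_width); (a + stride - 1) / stride rounds the lower bound up
+ // to the next whole output index. E.g. with pad_width = 1, filter_x = 0,
+ // stride = 2 (illustrative values), the loop starts at out_x = 1, since
+ // out_x = 0 would sample in_x = -1.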
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth, + int input_width, const uint8_t *input_data, + int16_t input_offset, int pad_width, + int depth_multiplier, int filter_width, + const uint8_t *filter_data, int16_t filter_offset, + int out_x_buffer_start, int out_x_buffer_end, + int output_depth, int32_t *acc_buffer) +{ + const uint8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int out_x_loop_start = std::max( + out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const uint8_t *input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) + { + const uint8_t *filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) + { + const int16_t input_val = *input_ptr++ + input_offset; + for (int m = 0; m < depth_multiplier; m++) + { + const int16_t filter_val = *filter_ptr++ + filter_offset; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const int32_t *bias_data, int32_t *acc_buffer) +{ + int i = 0; +#ifdef USE_NEON + if (output_depth == 1) + { + const int32x4_t b = vdupq_n_s32(bias_data[0]); + for (; i <= num_output_pixels - 16; i += 16) + { + vst1q_s32(acc_buffer + i + 0, b); + vst1q_s32(acc_buffer + i + 4, b); + vst1q_s32(acc_buffer + i + 8, b); + vst1q_s32(acc_buffer + i + 12, b); + } + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + i, b); + } + } + else if (output_depth == 2) + { + int32x4_t b = vdupq_n_s32(bias_data[0]); + b = vsetq_lane_s32(bias_data[1], b, 1); + b = vsetq_lane_s32(bias_data[1], b, 3); + for (; i <= num_output_pixels - 8; i += 8) + { + vst1q_s32(acc_buffer + 2 * i + 0, b); + vst1q_s32(acc_buffer + 2 * i + 4, b); + vst1q_s32(acc_buffer + 2 * i + 8, b); + vst1q_s32(acc_buffer + 2 * i + 12, b); + } + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 2 * i, b); + } + } + else if (output_depth == 4) + { + const int32x4_t b = vld1q_s32(bias_data); + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + 4 * i + 0, b); + vst1q_s32(acc_buffer + 4 * i + 4, b); + vst1q_s32(acc_buffer + 4 * i + 8, b); + vst1q_s32(acc_buffer + 4 * i + 12, b); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 4 * i, b); + } + } + else if (output_depth == 8) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + vst1q_s32(acc_buffer + 8 * i + 8, b0); + vst1q_s32(acc_buffer + 8 * i + 12, b1); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + } + } + else if (output_depth == 16) + { + const int32x4_t b0 = 
vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + const int32x4_t b2 = vld1q_s32(bias_data + 8); + const int32x4_t b3 = vld1q_s32(bias_data + 12); + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 16 * i + 0, b0); + vst1q_s32(acc_buffer + 16 * i + 4, b1); + vst1q_s32(acc_buffer + 16 * i + 8, b2); + vst1q_s32(acc_buffer + 16 * i + 12, b3); + } + } +#endif + for (; i < num_output_pixels; i++) + { + memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth); + } +} + +inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data) +{ + (void)bias_shape; + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); +#ifdef USE_NEON + const bool shift_left = (output_shift > 0); + const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1; +#endif + + static const int kAccBufferMaxSize = 2048; + int32_t acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + UNUSED_RELEASE(kAccBufferActualSize); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. 
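+ // Dispatch picks the first match in the macro list below. E.g.
+ // (hypothetical shapes) stride 1, input_depth 8, depth_multiplier 1
+ // selects QuantizedDepthwiseConvKernel<false, 8, 1>, while an unusual
+ // depth_multiplier such as 5 falls through to the generic row function.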
+ using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
+ row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
+ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
+ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
+ depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
+ { \
+ row_accum_func = \
+ QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
+ }
+
+#ifdef USE_NEON
+ // We go over our list of kernels in decreasing order of preference
+ // for the cases where multiple kernels could apply.
+
+ // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
+
+ // Next come the strided kernels: AllowStrided=true, fixed input depth.
+ // They are a bit less efficient, but allow stride!=1.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+ // Finally, the kernels allowing a variable input depth;
+ // these are the least efficient but most general kernels.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
+#endif // USE_NEON
+
+ // If no matching fast kernel was found, use the slow generic fallback.
+ if (!row_accum_func)
+ {
+ row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
+ }
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+ const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
+ const int input_batch_stride = input_height_stride * input_shape.Dims(1);
+ const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
+
+ // Now that we have determined row_accum_func, we can start work.
+ uint8_t *output_ptr = output_data;
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_y_start =
+ std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
+ const int filter_y_end =
+ std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
+ dilation_height_factor);
+ for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+ out_x_buffer_start += kOutputPixelsInAccBuffer)
+ {
+ const int out_x_buffer_end =
+ std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+ // We call a 'pixel' a group of activations that share all but the
+ // 'depth'/'channel' coordinate. num_output_pixels is the number of
+ // output pixels that we will accumulate in this loop iteration.
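+ // E.g. (illustrative sizes) with output_depth == 64 the stack buffer
+ // holds kAccBufferMaxSize / 64 == 32 output pixels, so each iteration
+ // of this loop covers a window of at most 32 out_x positions.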
+ const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + input_offset, pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, filter_offset, + out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating int32 values. Now need to convert them to + // the final 8bit form and store them. + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); + const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min); + const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max); + // Handle 16 values at once. + // This allows us to issue 4 mutually independent int32 + // multiplications (vqrdmulh), which should alleviate most of their + // high latency. + for (; i <= num_output_values - 16; i += 16) + { + int32x4_t acc[4]; + for (int j = 0; j < 4; j++) + { + acc[j] = vld1q_s32(acc_buffer + i + 4 * j); + } + + if (!shift_left) + { + // Fixed-point multiplication. + for (int j = 0; j < 4; j++) + { + acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); + } + for (int j = 0; j < 4; j++) + { + acc[j] = RoundingDivideByPOT(acc[j], -output_shift); + } + } + else + { + // Fixed-point multiplication. + for (int j = 0; j < 4; j++) + { + acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two); + acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); + } + } + // Add the output offset. + for (int j = 0; j < 4; j++) + { + acc[j] = vaddq_s32(acc[j], output_offset_vec); + } + // Apply the activation function. + for (int j = 0; j < 4; j++) + { + acc[j] = vmaxq_s32(acc[j], output_activation_min_vec); + } + for (int j = 0; j < 4; j++) + { + acc[j] = vminq_s32(acc[j], output_activation_max_vec); + } + // Saturating cast to uint8_t and store to destination. + int16x4_t acc_s16[4]; + for (int j = 0; j < 4; j++) + { + acc_s16[j] = vqmovn_s32(acc[j]); + } + const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]); + const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]); + const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0); + const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1); + vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1)); + output_ptr += 16; + } + // Handle 8 values at once. + // Not as good as 16 (now we're only issuing 2 mutually independent + // vqrdmulh instructions, so we're probably paying for their high + // latency). + for (; i <= num_output_values - 8; i += 8) + { + int32x4_t acc0 = vld1q_s32(acc_buffer + i); + int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); + if (!shift_left) + { + // Fixed-point multiplication. + acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); + acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); + // Rounding right shift. + acc0 = RoundingDivideByPOT(acc0, -output_shift); + acc1 = RoundingDivideByPOT(acc1, -output_shift); + } + else + { + // Fixed-point multiplication. 
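+ // vqrdmulhq_n_s32 computes a rounded doubling-high product, i.e.
+ // round(acc * output_multiplier / 2^31), so together with the plain
+ // multiply by 2^output_shift this applies the real output scale
+ // output_multiplier * 2^(output_shift - 31).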
+ acc0 = vmulq_n_s32(acc0, multiplier_power_of_two); + acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); + + acc1 = vmulq_n_s32(acc1, multiplier_power_of_two); + acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); + } + // Add the output offset. + acc0 = vaddq_s32(acc0, output_offset_vec); + acc1 = vaddq_s32(acc1, output_offset_vec); + // Apply the activation function. + acc0 = vmaxq_s32(acc0, output_activation_min_vec); + acc1 = vmaxq_s32(acc1, output_activation_min_vec); + acc0 = vminq_s32(acc0, output_activation_max_vec); + acc1 = vminq_s32(acc1, output_activation_max_vec); + // Saturating cast to uint8_t and store to destination. + const int16x4_t acc0_s16 = vqmovn_s32(acc0); + const int16x4_t acc1_s16 = vqmovn_s32(acc1); + const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16); + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(output_ptr, res_u8); + output_ptr += 8; + } + // Handle 4 values at once. Now we're paying the full price of the + // high latency of vqrdmulh. Also, storing only 4 bytes at the end + // (without any alignment) can only be done 1 byte at a time. + // Yet, that is still worth doing to minimize the amount of leftover + // that will have to go through the very slow scalar code. + for (; i <= num_output_values - 4; i += 4) + { + int32x4_t acc = vld1q_s32(acc_buffer + i); + if (!shift_left) + { + // Fixed-point multiplication. + acc = vqrdmulhq_n_s32(acc, output_multiplier); + // Rounding right shift. + acc = RoundingDivideByPOT(acc, -output_shift); + } + else + { + // Fixed-point multiplication. + acc = vmulq_n_s32(acc, multiplier_power_of_two); + acc = vqrdmulhq_n_s32(acc, output_multiplier); + } + // Add the output offset. + acc = vaddq_s32(acc, output_offset_vec); + // Apply the activation function. + acc = vmaxq_s32(acc, output_activation_min_vec); + acc = vminq_s32(acc, output_activation_max_vec); + // Saturating cast to uint8_t and store to destination. + const int16x4_t acc_s16 = vqmovn_s32(acc); + const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_lane_u8(output_ptr + 0, res_u8, 0); + vst1_lane_u8(output_ptr + 1, res_u8, 1); + vst1_lane_u8(output_ptr + 2, res_u8, 2); + vst1_lane_u8(output_ptr + 3, res_u8, 3); + output_ptr += 4; + } +#endif // USE_NEON + + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) + { + int32_t acc = acc_buffer[i]; + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + *output_ptr++ = static_cast<uint8_t>(acc); + } + } + } + } +} + +} // namespace optimized +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__ diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h new file mode 100644 index 000000000..ae1f9e78e --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_OPTIMIZED_UTILS_H__ +#define __NNFW_CKER_OPTIMIZED_OPTIMIZED_UTILS_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" + +#include <stdexcept> + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +template <typename T> +inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h, int b, int kheight, + int kwidth, int stride_width, int stride_height, + int pad_width, int pad_height, int in_width, int in_height, + int in_depth, int single_buffer_length, int buffer_id, + const T *in_data, T *conv_buffer_data, uint8_t zero_byte) +{ + assert(input_shape.DimensionsCount() == 4); + // This chunk of code reshapes all the inputs corresponding to + // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). + const int kwidth_times_indepth = kwidth * in_depth; + const int inwidth_times_indepth = in_width * in_depth; + const int ih_ungated_start = h * stride_height - pad_height; + const int ih_ungated_end = (ih_ungated_start + kheight); + const int ih_end = std::min(ih_ungated_end, in_height); + const int iw_ungated_start = w * stride_width - pad_width; + const int iw_ungated_end = (iw_ungated_start + kwidth); + const int iw_end = std::min(iw_ungated_end, in_width); + // If the patch is off the edge of the input image, skip writing those rows + // and columns from the patch into the output array. + const int h_offset = std::max(0, -ih_ungated_start); + const int w_offset = std::max(0, -iw_ungated_start); + const int ih_start = std::max(0, ih_ungated_start); + const int iw_start = std::max(0, iw_ungated_start); + const int single_row_num = std::min(kwidth - w_offset, in_width - iw_start) * in_depth; + const int output_row_offset = (buffer_id * single_buffer_length); + int out_offset = output_row_offset + (h_offset * kwidth + w_offset) * in_depth; + int in_offset = Offset(input_shape, b, ih_start, iw_start, 0); + + // Express all of the calculations as padding around the input patch. + const int top_padding = h_offset; + const int bottom_padding = (ih_ungated_end - ih_end); + const int left_padding = w_offset; + const int right_padding = (iw_ungated_end - iw_end); + assert(single_row_num == ((kwidth - (left_padding + right_padding)) * in_depth)); + + // Write out zeroes to the elements representing the top rows of the input + // patch that are off the edge of the input image. + if (top_padding > 0) + { + const int top_row_elements = (top_padding * kwidth * in_depth); + memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T))); + } + + // If the patch is on the interior of the input image horizontally, just copy + // over the rows sequentially, otherwise add zero padding at the start or end. 
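+ // E.g. (illustrative values) for a 3x3 kernel with stride 1 and pad 1,
+ // the patch at (h, w) = (0, 0) gets top_padding = left_padding = 1: one
+ // full zero row of 3 * in_depth elements is written first, and each
+ // copied row is prefixed with in_depth zeros before its
+ // single_row_num = 2 * in_depth payload (assuming in_width >= 2).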
+ if ((left_padding == 0) && (right_padding == 0)) + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + else + { + for (int ih = ih_start; ih < ih_end; ++ih) + { + if (left_padding > 0) + { + const int left_start = (out_offset - (left_padding * in_depth)); + memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T))); + } + memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T)); + if (right_padding > 0) + { + const int right_start = (out_offset + single_row_num); + memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T))); + } + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + + // If the bottom of the patch falls off the input image, pad the values + // representing those input rows with zeroes. + if (bottom_padding > 0) + { + const int bottom_row_elements = (bottom_padding * kwidth * in_depth); + const int bottom_start = + output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T))); + } +} + +// Supports per-batch zero_byte for per-batch asymmetric quantized inputs. +template <typename T> +void DilatedIm2col(const ConvParams ¶ms, const Shape &input_shape, const T *input_data, + const Shape &filter_shape, const Shape &output_shape, T *im2col_data, + const int32_t *zero_bytes, const int zero_bytes_len) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + // For dilated convolution, the input pixels are not contiguous therefore we + // can't use the same optimizations as Im2Col(). Though note this code would + // work fine for the non-dilated case too (though likely a bit slower). + assert(dilation_width_factor != 1 || dilation_height_factor != 1); + assert(im2col_data); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + MatchingDim(output_shape, 3, filter_shape, 0); + + // Construct the MxN sized im2col matrix. + // The rows M, are sub-ordered B x H x W + const Shape row_shape({1, batches, output_height, output_width}); + // The columns, N, are sub-ordered Kh x Kw x Din + const Shape col_shape({1, filter_height, filter_width, input_depth}); + // Use dimensions M and N to construct dims for indexing directly into im2col + const Shape im2col_shape({1, 1, row_shape.FlatSize(), col_shape.FlatSize()}); + + // Loop through the output rows (B x H x W) + for (int batch = 0; batch < batches; ++batch) + { + const T zero_byte = + zero_bytes_len > 1 ? 
static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + // Each im2col row is an output pixel. Arrange the input data in this + // row in an order we can conveniently multiply with the filter data. + int row_offset = Offset(row_shape, 0, batch, out_y, out_x); + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + // Loop through all the pixels of the filter (Kh x Kw) + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + if ((in_y >= 0) && (in_y < input_height)) + { + // Filter row is within the input data. + // Loop through all the filter pixels in this row. + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0); + T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset); + if ((in_x >= 0) && (in_x < input_width)) + { + // Filter pixel is within the input, copy the input data. + T const *src = input_data + Offset(input_shape, batch, in_y, in_x, 0); + memcpy(dst, src, input_depth * sizeof(T)); + } + else + { + // Filter pixel is outside the input, zero it out. + memset(dst, zero_byte, input_depth * sizeof(T)); + } + } + } + else + { + // Filter row is outside the input, zero out the entire filter row. + int col_offset = Offset(col_shape, 0, filter_y, 0, 0); + T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset); + memset(dst, zero_byte, filter_width * input_depth * sizeof(T)); + } + } + } + } + } +} + +template <typename T> +void DilatedIm2col(const ConvParams ¶ms, uint8_t zero_byte, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const Shape &output_shape, + T *im2col_data) +{ + const int32_t zero_point = static_cast<int32_t>(zero_byte); + DilatedIm2col<T>(params, input_shape, input_data, filter_shape, output_shape, im2col_data, + &zero_point, 1); +} + +template <typename T> +void Im2col(const ConvParams ¶ms, int kheight, int kwidth, uint8_t zero_byte, + const Shape &input_shape, const T *input_data, const Shape &output_shape, + T *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + + int buffer_id = 0; + // Loop over the output nodes. 
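+ // Each (b, h, w) output position becomes one im2col column holding the
+ // kheight * kwidth * in_depth patch elements (the caller sizes the
+ // output depth accordingly, passed here as output_depth), so buffer_id
+ // simply counts columns: batches * output_height * output_width in total.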
+ for (int b = 0; b < batches; ++b) + { + for (int h = 0; h < output_height; ++h) + { + for (int w = 0; w < output_width; ++w) + { + ExtractPatchIntoBufferColumn(input_shape, w, h, b, kheight, kwidth, stride_width, + stride_height, pad_width, pad_height, input_width, + input_height, input_depth, output_depth, buffer_id, input_data, + output_data, zero_byte); + ++buffer_id; + } + } + } +} + +} // namespace optimized +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_OPTIMIZED_OPTIMIZED_UTILS_H__ diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h new file mode 100644 index 000000000..e8ffd4014 --- /dev/null +++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REFERENCE_BATCH_MATMUL_H__ +#define __NNFW_CKER_REFERENCE_BATCH_MATMUL_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference +{ + +inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Shape &rhs_shape, + const float *rhs_data, const Shape &, float *output_data) +{ + const Shape extended_lhs_shape = Shape::ExtendedShape(5, lhs_shape); + const Shape extended_rhs_shape = Shape::ExtendedShape(5, rhs_shape); + + // Determine which dimension is the broadcast dimension. + auto broadcast_dim = [](int lhs_dim, int rhs_dim) { + if (lhs_dim == rhs_dim) + return lhs_dim; + if (lhs_dim == 1) + return rhs_dim; + assert(rhs_dim == 1); + return lhs_dim; + }; + + // Compute the "extent" for iterating on this dimension. + // If we are broadcasting, then don't advance (i.e return 0). + auto extent = [](const Shape &shape, int x) { + if (shape.Dims(x) == 1) + { + return 0; + } + int prod = 1; + for (int i = x + 1; i < shape.DimensionsCount(); ++i) + { + prod *= shape.Dims(i); + } + return prod; + }; + + const int batch_dim0 = broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0)); + const int batch_dim1 = broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1)); + const int batch_dim2 = broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2)); + + const int lhs_ext0 = extent(extended_lhs_shape, 0); + const int lhs_ext1 = extent(extended_lhs_shape, 1); + const int lhs_ext2 = extent(extended_lhs_shape, 2); + const int rhs_ext0 = extent(extended_rhs_shape, 0); + const int rhs_ext1 = extent(extended_rhs_shape, 1); + const int rhs_ext2 = extent(extended_rhs_shape, 2); + + // Set params for each matrix multiply. 
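+ // Note the operand layouts implied below: the RHS is read as
+ // rhs_ptr2[j * accum_depth + k], i.e. with its trailing two dimensions
+ // transposed relative to the LHS, and each result block is written
+ // column-major via out_ptr[lhs_rows * j + i]. Broadcasting example
+ // (illustrative shapes): lhs batch dims [1, 2, 7] against rhs batch dims
+ // [5, 2, 1] give batch_dim0/1/2 = 5, 2, 7, and a broadcast (size-1)
+ // dimension gets extent 0 so that operand's pointer does not advance.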
+ const int lhs_rows = extended_lhs_shape.Dims(3); + const int rhs_cols = extended_rhs_shape.Dims(4); + const int accum_depth = extended_lhs_shape.Dims(4); + + for (int b0 = 0; b0 < batch_dim0; ++b0) + { + const float *lhs_ptr0 = lhs_data + (b0 * lhs_ext0); + const float *rhs_ptr0 = rhs_data + (b0 * rhs_ext0); + for (int b1 = 0; b1 < batch_dim1; ++b1) + { + const float *lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; + const float *rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; + for (int b2 = 0; b2 < batch_dim2; ++b2) + { + const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; + const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; + float *out_ptr = + output_data + + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; + for (int j = 0; j < rhs_cols; ++j) + { + for (int i = 0; i < lhs_rows; ++i) + { + float total = 0.f; + for (int k = 0; k < accum_depth; ++k) + { + total += lhs_ptr2[accum_depth * i + k] * rhs_ptr2[j * accum_depth + k]; + } + int idx = lhs_rows * j + i; + out_ptr[idx] = total; + } + } + } + } + } +} + +} // namespace reference +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_BATCH_MATMUL_H__ diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h new file mode 100644 index 000000000..f7e39248c --- /dev/null +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_BINARYARITHMETICOPS_H__
+#define __NNFW_CKER_REFERENCE_BINARYARITHMETICOPS_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <cmath>
+#include <functional>
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+template <typename T>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+                               const T *input1_data, const Shape &input2_shape,
+                               const T *input2_data, const Shape &output_shape, T *output_data,
+                               const std::function<T(const T &, const T &)> &fn)
+{
+  const int32_t flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i)
+  {
+    output_data[i] = ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+                                                  params.quantized_activation_min,
+                                                  params.quantized_activation_max);
+  }
+}
+
+template <>
+inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+                               const float *input1_data, const Shape &input2_shape,
+                               const float *input2_data, const Shape &output_shape,
+                               float *output_data,
+                               const std::function<float(const float &, const float &)> &fn)
+{
+  const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < size; i++)
+  {
+    output_data[i] =
+      ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
+                                   params.float_activation_min, params.float_activation_max);
+  }
+}
+
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlowQuant8(
+  const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+  const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data,
+  const std::function<T(const BinaryArithmeticOpParam &params, const T &, const T &)> &fn)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+  const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+  // Either bound falling outside of [0, 255] means the params do not describe
+  // a uint8 quantization, which is the only case this kernel supports.
+  if ((params.quantized_activation_min < 0) || (params.quantized_activation_max > 255))
+  {
+    throw std::runtime_error{"Support only for Quant8."};
+  }
+
+  // Comment from tensorflow lite:
+  //
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
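+  //
+  // Concretely (values chosen for this note, not from the original source):
+  // broadcasting input1 of shape [1, 2, 1, 3] against input2 of shape
+  // [1, 1, 4, 3] iterates the output as [1, 2, 4, 3];
+  // NdArrayDescsForElementwiseBroadcast assigns a stride of 0 to each size-1
+  // dimension, so SubscriptToIndex keeps re-reading the same element of the
+  // broadcast input while the other input and the output advance.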
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+        {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+            ActivationFunctionWithMinMax<uint8_t>(
+              fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+                 input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+              params.quantized_activation_min, params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam &params,
+                                            const Shape &input1_shape, const T *input1_data,
+                                            const Shape &input2_shape, const T *input2_data,
+                                            const Shape &output_shape, T *output_data,
+                                            const std::function<T(const T &, const T &)> &fn)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+  const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+  // Comment from tensorflow lite:
+  //
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+        {
+          output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>(
+            fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+               input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+            params.quantized_activation_min, params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+template <>
+inline void BroadcastBinaryArithmeticOpSlow(
+  const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data,
+  const Shape &input2_shape, const float *input2_data, const Shape &output_shape,
+  float *output_data, const std::function<float(const float &, const float &)> &fn)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+  const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+        {
+          output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
+            fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+               input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+            params.float_activation_min, params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_BINARYARITHMETICOPS_H__
diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h
new file mode 100644
index 000000000..86e8b5143
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/Conv.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_CONV_H__
+#define __NNFW_CKER_REFERENCE_CONV_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference
+{
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
+                 const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
+                 const float *bias_data, const Shape &output_shape, float *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  UNUSED_RELEASE(bias_shape);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data)
+  {
+    assert(bias_shape.FlatSize() == output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+        {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          float total = 0.f;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              // If the location is outside the bounds of the input image,
+              // use zero as a default value.
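+              // For instance, with stride_width = 1 and pad_width = 1 (values
+              // chosen for this note), out_x = 0 gives in_x_origin = -1, so the
+              // filter_x = 0 tap falls at in_x = -1, fails the bounds check
+              // below, and contributes nothing, which is exactly zero padding.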
+              if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+              {
+                const int in_offset = Offset(input_shape, batch, in_y, in_x, 0);
+                const int filter_offset = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+                for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+                {
+                  float input_value = input_data[in_offset + in_channel];
+                  float filter_value = filter_data[filter_offset + in_channel];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+          }
+          float bias_value = 0.0f;
+          if (bias_data)
+          {
+            bias_value = bias_data[out_channel];
+          }
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+            ActivationFunctionWithMinMax(total + bias_value, output_activation_min,
+                                         output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+                 const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
+                 const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+{
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32_t input_offset = params.input_offset;
+  const int32_t filter_offset = params.weights_offset;
+  const int32_t output_offset = params.output_offset;
+  const int32_t output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32_t output_activation_min = params.quantized_activation_min;
+  const int32_t output_activation_max = params.quantized_activation_max;
+  assert(output_activation_min <= output_activation_max);
+
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  UNUSED_RELEASE(bias_shape);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data)
+  {
+    assert(bias_shape.FlatSize() == output_depth);
+  }
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch)
+  {
+    for (int out_y = 0; out_y < output_height; ++out_y)
+    {
+      for (int out_x = 0; out_x < output_width; ++out_x)
+      {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+        {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32_t acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+          {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+            {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              // If the location is outside the bounds of the input image,
+              // use zero as a default value.
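+              // Note on the quantized arithmetic below: input_offset and
+              // filter_offset carry the negated zero points of the input and
+              // filter tensors, so each (value + offset) term recovers the
+              // integer distance from the zero point before multiplication.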
+              if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+              {
+                const int in_base = Offset(input_shape, batch, in_y, in_x, 0);
+                const int filter_base = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+                for (int in_channel = 0; in_channel < input_depth; in_channel++)
+                {
+                  int32_t input_val = input_data[in_channel + in_base];
+                  int32_t filter_val = filter_data[in_channel + filter_base];
+                  acc += (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+          }
+          if (bias_data)
+          {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+            static_cast<uint8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+} // namespace reference
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_CONV_H__
diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h
new file mode 100644
index 000000000..7b4ff2040
--- /dev/null
+++ b/compute/cker/include/cker/ruy/RuySupport.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RUY_RUY_SUPPORT_H__
+#define __NNFW_CKER_RUY_RUY_SUPPORT_H__
+
+#include <util/ConfigSource.h>
+#include <ruy/matrix.h>
+#include <ruy/ruy.h>
+#include <cassert>
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace ruy_support
+{
+
+inline ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy)
+{
+  switch (cache_policy)
+  {
+    case CachePolicy::kNeverCache:
+      return ruy::CachePolicy::kNeverCache;
+    case CachePolicy::kCacheIfLargeSpeedup:
+      return ruy::CachePolicy::kCacheIfLargeSpeedup;
+    case CachePolicy::kAlwaysCache:
+      return ruy::CachePolicy::kAlwaysCache;
+    default:
+      assert(false);
+      return ruy::CachePolicy::kNeverCache;
+  }
+}
+
+template <typename Scalar, typename DataPointer>
+void MakeRuyMatrix(const MatrixParams<Scalar> &params, DataPointer data_ptr,
+                   ruy::Matrix<Scalar> *dst, bool use_caching = false)
+{
+  ruy::Order ruy_order =
+    params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor;
+  ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout());
+  // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer.
+  // It does care whether we assign to it a Scalar* or a const Scalar*.
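+  // In practice, assigning a const Scalar * marks the matrix as read-only for
+  // ruy (appropriate for the lhs/rhs operands), while a non-const Scalar * is
+  // needed for the destination matrix; ruy checks this distinction at runtime.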
+  dst->set_data(data_ptr);
+  dst->set_zero_point(params.zero_point);
+  if (use_caching)
+  {
+    dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy));
+  }
+}
+
+template <typename GemmParamsType, typename RuySpecType>
+void MakeRuyMulParams(const GemmParamsType &params, RuySpecType *ruy_mul_params)
+{
+  // This validation has already been performed by the Gemm API entry point,
+  // but it doesn't hurt to test specifically this again here, where it's
+  // being used.
+  ValidateGemmParams(params);
+
+  ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint);
+  ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent);
+  ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel);
+  ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel);
+  ruy_mul_params->set_bias(params.bias);
+  ruy_mul_params->set_clamp_min(params.clamp_min);
+  ruy_mul_params->set_clamp_max(params.clamp_max);
+}
+
+} // namespace ruy_support
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RUY_RUY_SUPPORT_H__
diff --git a/compute/test/CMakeLists.txt b/compute/test/CMakeLists.txt
new file mode 100644
index 000000000..92aac3e72
--- /dev/null
+++ b/compute/test/CMakeLists.txt
@@ -0,0 +1,17 @@
+if(NOT ENABLE_TEST)
+  return()
+endif(NOT ENABLE_TEST)
+
+set(TEST_COMPUTE test_compute)
+
+file(GLOB_RECURSE TESTS "*.cc")
+
+add_executable(${TEST_COMPUTE} ${TESTS})
+
+target_link_libraries(${TEST_COMPUTE} nnfw_lib_cker)
+target_link_libraries(${TEST_COMPUTE} gtest)
+target_link_libraries(${TEST_COMPUTE} gtest_main)
+target_link_libraries(${TEST_COMPUTE} ${LIB_PTHREAD} dl)
+add_test(${TEST_COMPUTE} ${TEST_COMPUTE})
+
+install(TARGETS ${TEST_COMPUTE} DESTINATION unittest_standalone)
diff --git a/compute/test/cker/Range.cc b/compute/test/cker/Range.cc
new file mode 100644
index 000000000..55f4fcf20
--- /dev/null
+++ b/compute/test/cker/Range.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cker/operation/Range.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+TEST(CKer_Operation, Range)
+{
+  {
+    const int start = 0;
+    const int limit = 10;
+    const int delta = 1;
+    std::vector<int> actual(10);
+    nnfw::cker::Range<int>(&start, &limit, &delta, actual.data());
+
+    for (int i = 0; i < static_cast<int>(actual.size()); i++)
+      ASSERT_EQ(actual[i], i);
+  }
+
+  {
+    const int start = 3;
+    const int limit = 18;
+    const int delta = 3;
+    std::vector<int> expected = {3, 6, 9, 12, 15};
+    std::vector<int> actual(expected.size());
+    nnfw::cker::Range<int>(&start, &limit, &delta, actual.data());
+
+    for (int i = 0; i < static_cast<int>(actual.size()); i++)
+      ASSERT_EQ(actual[i], expected[i]);
+  }
+
+  {
+    const float start = 3;
+    const float limit = 1;
+    const float delta = -0.5;
+    std::vector<float> expected = {3, 2.5, 2, 1.5};
+    std::vector<float> actual(expected.size());
+    nnfw::cker::Range<float>(&start, &limit, &delta, actual.data());
+
+    for (int i = 0; i < static_cast<int>(actual.size()); i++)
+      ASSERT_FLOAT_EQ(actual[i], expected[i]);
+  }
+}
+
+TEST(CKer_Operation, neg_Range)
+{
+  {
+    const int start = 212;
+    const int limit = 10;
+    const int delta = 1;
+    std::vector<int> actual(10);
+
+    EXPECT_ANY_THROW(nnfw::cker::Range<int>(&start, &limit, &delta, actual.data()));
+  }
+}
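For readers wiring these kernels up, a minimal usage sketch of the reference BinaryArithmeticOp added above. It is illustrative only: the brace-initialized Shape assumes cker's Shape keeps the initializer-list constructor it inherits from TensorFlow Lite's RuntimeShape, and the main() scaffolding is not part of the import.

#include "cker/operation/reference/BinaryArithmeticOps.h"

#include <cstdio>
#include <limits>
#include <vector>

int main()
{
  // Open activation bounds so the clamp in ActivationFunctionWithMinMax is a no-op.
  nnfw::cker::BinaryArithmeticOpParam param{};
  param.float_activation_min = std::numeric_limits<float>::lowest();
  param.float_activation_max = std::numeric_limits<float>::max();

  // One 1x1x2x2 tensor per operand; both inputs and the output share a shape,
  // so the non-broadcast entry point applies.
  const nnfw::cker::Shape shape{1, 1, 2, 2}; // assumes initializer-list ctor
  const std::vector<float> lhs{1.f, 2.f, 3.f, 4.f};
  const std::vector<float> rhs{10.f, 20.f, 30.f, 40.f};
  std::vector<float> out(4);

  // Element-wise addition supplied through the std::function hook.
  nnfw::cker::reference::BinaryArithmeticOp<float>(
    param, shape, lhs.data(), shape, rhs.data(), shape, out.data(),
    [](const float &a, const float &b) { return a + b; });

  for (float v : out)
    std::printf("%g\n", v); // prints 11 22 33 44
}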