diff options
author | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:04 +0900 |
---|---|---|
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-12-14 14:43:04 +0900 |
commit | 12d88feea8573f8490629cf62fc342b152e57d65 (patch) | |
tree | 3c734cc4d629834d2d523f4575ef84cd64684e57 /compute/ARMComputeEx | |
parent | d6b371e095d737922187a518b8faba1ef6f3a2b1 (diff) | |
download | nnfw-12d88feea8573f8490629cf62fc342b152e57d65.tar.gz nnfw-12d88feea8573f8490629cf62fc342b152e57d65.tar.bz2 nnfw-12d88feea8573f8490629cf62fc342b152e57d65.zip |
Imported Upstream version 1.11.0upstream/1.11.0
Diffstat (limited to 'compute/ARMComputeEx')
141 files changed, 22943 insertions, 0 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt new file mode 100644 index 000000000..58f558db2 --- /dev/null +++ b/compute/ARMComputeEx/CMakeLists.txt @@ -0,0 +1,36 @@ +nnfw_find_package(ARMCompute QUIET) + +if(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: need ARM Compute library") + return() +else(NOT ARMCompute_FOUND) + message(STATUS "Check ARM Compute library extension build: OK") +endif(NOT ARMCompute_FOUND) + +set(ACL_EX_BASE ${CMAKE_CURRENT_SOURCE_DIR}) + +file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp") + +# generate embedded cl_kernel +execute_process ( + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + COMMAND bash -c "python resolve_includes.py" +) + +add_library(arm_compute_ex SHARED ${ACL_EX_SRCS}) +target_include_directories(arm_compute_ex PUBLIC ${ACL_EX_BASE}) +target_link_libraries(arm_compute_ex PRIVATE arm_compute) +target_link_libraries(arm_compute_ex PRIVATE nnfw_common) +target_link_libraries(arm_compute_ex PRIVATE nnfw_coverage) +# Defines to enable validate check in debug build +target_compile_definitions(arm_compute_ex PRIVATE EMBEDDED_KERNELS + $<$<CONFIG:Debug>:ARM_COMPUTE_DEBUG_ENABLED ARM_COMPUTE_ASSERTS_ENABLED + ARM_COMPUTE_LOGGING_ENABLED>) +# Validate check functions are not used on release build +# Some parameters are used only for validate check function calls, and these parameters may be unused on release build +# Because clang requires "-Wno-unused-parameter -Wno-unused-function" to be added after "-Wall", +# this should be after linking nnfw_common and use interface lib linking +add_library(ignore_unused_warning INTERFACE) +target_compile_options(ignore_unused_warning INTERFACE -Wno-unused-parameter -Wno-unused-function) +target_link_libraries(arm_compute_ex PRIVATE $<$<NOT:$<CONFIG:Debug>>:ignore_unused_warning>) +install(TARGETS arm_compute_ex DESTINATION lib) diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h 
b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h new file mode 100644 index 000000000..d29886a9d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLKernelLibraryEx.h + * @ingroup COM_AI_RUNTIME + * @brief This file is a cloned version of CLKernelLibrary.h in ACL. This file defines + * an interface for CLKernelLibrary.cpp which adds more OpenCL kernels on top of ACL. + */ + +#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ +#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ + +#include "arm_compute/core/CL/OpenCL.h" + +#include <map> +#include <set> +#include <string> +#include <utility> + +namespace arm_compute +{ + +/** + * @brief Class to build OpenCL kernels added from nnfw + * */ +class CLKernelLibraryEx +{ + using StringSet = std::set<std::string>; + +private: + /** + * @brief Construct a new CLKernelLibraryEx object + */ + CLKernelLibraryEx(); + +public: + /** + * @brief Prevent instances of this class from being copied. + */ + CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; + + /** + * @brief Prevent instances of this class from being copied. + */ + const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; + + /** + * @brief Get the KernelLibrary singleton. + * @return The KernelLibrary instance + */ + static CLKernelLibraryEx &get(); + + /** + * @brief Initialise the kernel library. + * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @param[in] context CL context used to create programs. + * @param[in] device CL device for which the programs are created. + * @return N/A + */ + void init(std::string kernel_path, cl::Context context, cl::Device device) + { + _kernel_path = std::move(kernel_path); + _context = std::move(context); + _device = std::move(device); + } + + /** + * @brief Set the path that the kernels reside in. 
+ * @param[in] kernel_path Path of the directory from which kernel sources are loaded. + * @return N/A + */ + void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; + + /** + * @brief Get the path that the kernels reside in. + * @return the path of kernel files + */ + std::string get_kernel_path() { return _kernel_path; }; + + /** + * @brief Get the source of the selected program. + * @param[in] program_name Program name. + * @return Source of the selected program. + */ + std::string get_program_source(const std::string &program_name); + + /** + * @brief Set the CL context used to create programs. + * @note Setting the context also resets the device to the + * first one available in the new context. + * @param[in] context A CL context. + * @return N/A + */ + void set_context(cl::Context context) + { + _context = std::move(context); + if (_context.get() == nullptr) + { + _device = cl::Device(); + } + else + { + const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); + + if (cl_devices.empty()) + { + _device = cl::Device(); + } + else + { + _device = cl_devices[0]; + } + } + } + + /** + * @brief Return associated CL context. + * @return A CL context. + */ + cl::Context &context() { return _context; } + + /** + * @brief Set the CL device for which the programs are created. + * @param[in] device A CL device. + * @return N/A + */ + void set_device(cl::Device device) { _device = std::move(device); } + + /** + * @brief Gets the CL device for which the programs are created. + * @return A CL device. + */ + cl::Device &get_device() { return _device; } + + /** + * @brief Return the device version + * @return The content of CL_DEVICE_VERSION + */ + std::string get_device_version(); + + /** + * @brief Create a kernel from the kernel library. + * @param[in] kernel_name Kernel name. + * @param[in] build_options_set Kernel build options as a set. + * @return The created kernel. 
+ */ + Kernel create_kernel(const std::string &kernel_name, + const StringSet &build_options_set = {}) const; + + /** + * @brief Find the maximum number of local work items in a workgroup that can be supported for the + * kernel. + * @param[in] kernel kernel object + */ + + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; + /** + * @brief Return the default NDRange for the device. + * @return default NDRange of the device + */ + cl::NDRange default_ndrange() const; + + /** + * @brief Clear the library's cache of binary programs + * @return N/A + */ + void clear_programs_cache() + { + _programs_map.clear(); + _built_programs_map.clear(); + } + + /** + * @brief Access the cache of built OpenCL programs + * @return map from a name string to the built cl::Program (keys are presumably the names + * passed to add_built_program() -- confirm against the implementation) + */ + const std::map<std::string, cl::Program> &get_built_programs() const + { + return _built_programs_map; + } + + /** + * @brief Add a new built program to the cache + * @param[in] built_program_name Name of the program + * @param[in] program Built program to add to the cache + * @return N/A + */ + void add_built_program(const std::string &built_program_name, cl::Program program); + + /** + * @brief Returns true if FP16 is supported by the CL device + * @return true if the CL device supports FP16 + */ + bool fp16_supported() const; + + /** + * @brief Returns true if int64_base_atomics extension is supported by the CL device + * @return true if the CL device supports int64_base_atomics extension + */ + bool int64_base_atomics_supported() const; + +private: + /** + * @brief Load program and its dependencies. + * @param[in] program_name Name of the program to load. + */ + const Program &load_program(const std::string &program_name) const; + /** + * @brief Concatenates contents of a set into a single string. + * @param[in] s Input set to concatenate. + * @return Concatenated string. 
+ */ + std::string stringify_set(const StringSet &s) const; + + cl::Context _context; /**< Underlying CL context. */ + cl::Device _device; /**< Underlying CL device. */ + std::string _kernel_path; /**< Path to the kernels folder. */ + mutable std::map<std::string, const Program> + _programs_map; /**< Map with all already loaded program data. */ + mutable std::map<std::string, cl::Program> + _built_programs_map; /**< Map with all already built program data. */ + static const std::map<std::string, std::string> + _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map<std::string, std::string> + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. >*/ +}; +} +#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h new file mode 100644 index 000000000..a0aa0560b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H +#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the reduction operation kernel + * + * @note The default data type for an uninitialized output tensor is + * signed 32-bit integer (S32). It is the user's responsibility to check + * that the results do not overflow because the indices are computed + * in unsigned 32-bit (U32). 
+ */ +class CLArgMinMaxLayerKernelEx : public ICLKernel +{ +public: + /** Default constructor */ + CLArgMinMaxLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default; + /** Default destructor */ + ~CLArgMinMaxLayerKernelEx() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor of the previous iterations of @ref + * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[out] output Destination tensor. Data types supported: U32/S32 + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. + */ + void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, + unsigned int axis, ReductionOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArgMinMaxLayerKernelEx. + * + * @param[in] input Source tensor info. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor info of the previous iterations. Data types + * supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[in] output Destination tensor info. 
Data types supported: U32/S32 + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, ReductionOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_prev_output; + ICLTensor *_output; + unsigned int _reduction_axis; + ReductionOperation _op; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h new file mode 100644 index 000000000..bb6fcb8f5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ +#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to return truth values of two input tensors for Binary Logical Op*/ +class CLBinaryLogicalOpKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLBinaryLogicalOpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLBinaryLogicalOpKernel(const CLBinaryLogicalOpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + CLBinaryLogicalOpKernel &operator=(const CLBinaryLogicalOpKernel &) = delete; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel(CLBinaryLogicalOpKernel &&) = default; + /** Allow instances of this class to be moved */ + CLBinaryLogicalOpKernel &operator=(CLBinaryLogicalOpKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input1 Source tensor1. + * @param[in] input2 Source tensor2. + * @param[out] output Output tensor. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h new file mode 100644 index 000000000..ed668fd9c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLCastBoolKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastBoolKernel class + */ + +#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ +#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ + +#include "arm_compute/core/CL/ICLSimple3DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class for the kernel converting boolean type + */ +class CLCastBoolKernel : public ICLSimple3DKernel +{ +public: + /** + * @brief Initialise the kernel's input and output. + * @param[in] input Input tensor. Data types supported: U8 + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. 
+ * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLCastBoolKernel + * + * @param[in] input Source tensor info. Data types supported: U8. + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h new file mode 100644 index 000000000..a614d5259 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * @file CLEmbeddingLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLEmbeddingLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform EmbeddingLookup operation with opencl kernel +*/ +class CLEmbeddingLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLEmbeddingLookupKernel object + * */ + CLEmbeddingLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLEmbeddingLookupKernel &operator=(const CLEmbeddingLookupKernel &) = delete; + + /** + * @brief Construct a CLEmbeddingLookupKernel object by using default move constructor + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLEmbeddingLookupKernel object to move + * */ + CLEmbeddingLookupKernel &operator=(CLEmbeddingLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLEmbeddingLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] input Source tensor. + * Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] lookups Lookups are 1D tensor that values are indices into the first + * dimension of input. + * Data types supported: S32. 
+ * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLEmbeddingLookupKernel + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info. Data types supported: same as @p input. + * @param[in] lookups Lookups info. Data types supported: S32. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel. + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ + const ICLTensor *_lookups; /**< Lookups tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h new file mode 100644 index 000000000..6630c7be7 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLGatherExKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLGatherExKernel class + */ + +#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ +#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define an interface for the gather kernel. 
+ */ +class CLGatherExKernel : public ICLKernel +{ +public: + /** + * @brief Construct CLGatherExKernel object + * */ + CLGatherExKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel(const CLGatherExKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + */ + CLGatherExKernel &operator=(const CLGatherExKernel &) = delete; + + /** + * @brief Construct CLGatherExKernel object by using default move constructor + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel(CLGatherExKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLGatherExKernel object to move + */ + CLGatherExKernel &operator=(CLGatherExKernel &&) = default; + + /** + * @brief Initialise the kernel's input, indices, output and gather axis. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor. Data types supported: S32. + * @param[out] output The output tensor. Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLGatherExKernel + * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices Indices tensor info. Data types supported: S32. + * @param[in] output The output tensor info. Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative + * values wrap around. 
Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_indices; + ICLTensor *_output; + int _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLGATHEREXKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h new file mode 100644 index 000000000..99cfa61ec --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +/** + * @file CLHashtableLookupKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLHashtableLookupKernel class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** +* @brief Class to perform HashtableLookup operation with an OpenCL kernel +*/ +class CLHashtableLookupKernel : public ICLKernel +{ +public: + /** + * @brief Construct a CLHashtableLookupKernel object + * */ + CLHashtableLookupKernel(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + * */ + CLHashtableLookupKernel &operator=(const CLHashtableLookupKernel &) = delete; + + /** + * @brief Construct a CLHashtableLookupKernel object by using default move constructor + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel(CLHashtableLookupKernel &&) = default; + + /** + * @brief Move assignment operator + * @param[in] CLHashtableLookupKernel object to move + * */ + CLHashtableLookupKernel &operator=(CLHashtableLookupKernel &&) = default; + + /** + * @brief Destruct this object + * */ + ~CLHashtableLookupKernel() = default; + + /** + * @brief Set the input and output of the kernel + * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor.
A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, + ICLTensor *output, ICLTensor *hits); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLHashtableLookupKernel + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel. + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_lookups{nullptr}; /**< Lookups tensor */ + const ICLTensor *_keys{nullptr}; /**< Keys tensor */ + const ICLTensor *_input{nullptr}; /**< Source tensor */ + ICLTensor *_output{nullptr}; /**< Destination tensor */ + ICLTensor *_hits{nullptr}; /**< Hits tensor */ + std::unique_ptr<CLTensor> _lookup_indices{nullptr}; /**< Lookup indices tensor. NOTE(review): std::unique_ptr is used but <memory> is not included directly by this header - presumably pulled in transitively; confirm. */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..f57e799ad --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ +#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for performing an instance normalization */ +class CLInstanceNormalizationLayerKernelEx : public ICLKernel +{ +public: + /** Constructor */ + CLInstanceNormalizationLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLInstanceNormalizationLayerKernelEx(const CLInstanceNormalizationLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLInstanceNormalizationLayerKernelEx & + operator=(const CLInstanceNormalizationLayerKernelEx &) = delete; + /** Default Move Constructor.
*/ + CLInstanceNormalizationLayerKernelEx(CLInstanceNormalizationLayerKernelEx &&) = default; + /** Default move assignment operator */ + CLInstanceNormalizationLayerKernelEx & + operator=(CLInstanceNormalizationLayerKernelEx &&) = default; + /** Default destructor */ + ~CLInstanceNormalizationLayerKernelEx() = default; + + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: + * NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr, + ICLTensor *beta = nullptr, float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLInstanceNormalizationLayerKernelEx. + * + * @param[in] input Source tensor info. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW. NOTE(review): configure() documents NCHW only - confirm which is correct. + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization.
Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_output; + ICLTensor *_gamma; + ICLTensor *_beta; + float _epsilon; + bool _run_in_place; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h new file mode 100644 index 000000000..90e8b5705 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ +#define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface to multiply scale factor kernel. */ +class CLMultiplyScaleFactorKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLMultiplyScaleFactorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMultiplyScaleFactorKernel(const CLMultiplyScaleFactorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMultiplyScaleFactorKernel &operator=(const CLMultiplyScaleFactorKernel &) = delete; + /** Default Move Constructor.
*/ + CLMultiplyScaleFactorKernel(CLMultiplyScaleFactorKernel &&) = default; + /** Default move assignment operator */ + CLMultiplyScaleFactorKernel &operator=(CLMultiplyScaleFactorKernel &&) = default; + /** Default destructor */ + ~CLMultiplyScaleFactorKernel() = default; + /** Set the input, scale factor and output tensors. + * + * @param[in] input Source tensor. Data type supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor. + * @param[in] multiplier (Optional) Additional scale value. Defaults to 1.f + */ + void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output, + float multiplier = 1.f); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLMultiplyScaleFactorKernel + * + * @param[in] input Input tensor info. Data types supported: S32. + * @param[in] scale_factor Scale tensor info. Data type supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor. + * @note Unlike configure(), this function takes no multiplier argument. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output); + + /** + * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command + * queue. + * @note The queue is *not* flushed by this method, and therefore the kernel will not have + * been executed by the time this method returns. + * @param[in] window Region on which to execute the kernel. (Must be a valid region of + * the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel. + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_scale_factor; + ICLTensor *_output; + float _multiplier; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h new file mode 100644 index 000000000..fa383c0d0 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ +#define __ARM_COMPUTE_CLNEGKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform a negation operation on a tensor */ +class CLNegKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLNegKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel(const CLNegKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLNegKernel &operator=(const CLNegKernel &) = delete; + /** Allow instances of this class to be moved */ + CLNegKernel(CLNegKernel &&) = default; + /** Allow instances of this class to be moved */ + CLNegKernel &operator=(CLNegKernel &&) = default; + /** Initialize the kernel's input and output tensors. + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor.
+ */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEGKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h new file mode 100644 index 000000000..a512057b9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__ +#define __ARM_COMPUTE_CLONEHOTKERNEL_H__ +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" +namespace arm_compute +{ +class ICLTensor; +/** Interface for the kernel to perform one-hot encoding */ +class CLOneHotKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLOneHotKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHotKernel(const CLOneHotKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHotKernel &operator=(const CLOneHotKernel &) = delete; + /** Allow instances of this class to be moved */ + CLOneHotKernel(CLOneHotKernel &&) = default; + /** Allow instances of this class to be moved */ + CLOneHotKernel &operator=(CLOneHotKernel &&) = default; + /** Default destructor */ + ~CLOneHotKernel() = default; + /** Initialise the kernel's inputs and output + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension.
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value, + ICLTensor *output, int depth, int axis = -1); + /** Initialise the kernel's inputs and output already initialized to off_value + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, int depth, + int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1.
+ * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel without off_value + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *output, int depth, int axis = -1); + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + /** Initialise the kernel's inputs and outputs internally + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis The axis to fill (no default here). Negative values wrap around.
+ * value must be in range [-indices.rank , indices.rank) + */ + void configure_common(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + int depth, int axis); + +private: + const ICLTensor *_indices; /**< Indices tensor */ + const ICLTensor *_on_value; /**< On value tensor */ + const ICLTensor *_off_value; /**< Off value tensor */ + ICLTensor *_output; /**< Destination tensor */ + bool _is_off_value_memset; /**< Whether off_value is zero */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLONEHOTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h new file mode 100644 index 000000000..4e1b56cba --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ +#define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the quantization layer kernel. + * + * @note The implementation supports only 2D input tensors. + */ +class CLQuantizationSymmetricKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLQuantizationSymmetricKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLQuantizationSymmetricKernel(const CLQuantizationSymmetricKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLQuantizationSymmetricKernel &operator=(const CLQuantizationSymmetricKernel &) = delete; + /** Default Move Constructor.
*/ + CLQuantizationSymmetricKernel(CLQuantizationSymmetricKernel &&) = default; + /** Default move assignment operator */ + CLQuantizationSymmetricKernel &operator=(CLQuantizationSymmetricKernel &&) = default; + /** Default destructor */ + ~CLQuantizationSymmetricKernel() = default; + /** Set the input, scale factor and output tensors. + * + * @param[in] input Source tensor. Data types supported: F32/F16. + * @param[in] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + * @param[out] output Destination tensor with the same dimensions as @p input. Data types supported: + * S8. + * + * @note Output auto initialization is not supported by this kernel + */ + void configure(const ICLTensor *input, const ICLTensor *scale_factor, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLQuantizationSymmetricKernel + * + * @param[in] input Input tensor info. Data types supported: F32/F16. + * @param[in] scale_factor Scale tensor info of @p output. Data type supported: Same as @p input. + * @param[in] output Destination tensor info with the same dimensions as @p input. Data types + * supported: S8. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_scale_factor; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h new file mode 100644 index 000000000..4f9042e41 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd.
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * @file CLReduceOperationKernel.h + * @brief This file defines CLReduceOperationKernel class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define interface for the reduce operation kernel + */ +class CLReduceOperationKernel : public ICLKernel +{ +public: + /** + * @brief Default constructor + */ + CLReduceOperationKernel(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel(const CLReduceOperationKernel &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLReduceOperationKernel &operator=(const CLReduceOperationKernel &) = delete; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel(CLReduceOperationKernel &&) = default; + /** + * @brief Allow instances of this class to be moved + */ + CLReduceOperationKernel &operator=(CLReduceOperationKernel &&) = default; + /** + * @brief Default destructor + */ + ~CLReduceOperationKernel() = default; + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @note NOTE(review): the class declares no member that stores @p op; presumably it is consumed + * while configuring - confirm in the implementation. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, + ReductionOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperationKernel. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. + * @param[in] op Reduce operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, + ReductionOperation op); + + /** + * @brief Run CLReduceOperationKernel op + * @param[in] window Window to be used for in_slice + * @param[in] queue CLQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + uint32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h new file mode 100644 index 000000000..4d4478ece --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ +#define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel that computes the scale values written to the output tensor + * (shape [batches]) from an F32 input tensor; see configure(). + * + * NOTE(review): the class name suggests scale factors for symmetric 8-bit quantization - + * confirm against the implementation.
+ */ +class CLScaleFactorSymm8Kernel : public ICLKernel +{ +public: + /** Default constructor */ + CLScaleFactorSymm8Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLScaleFactorSymm8Kernel(const CLScaleFactorSymm8Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLScaleFactorSymm8Kernel &operator=(const CLScaleFactorSymm8Kernel &) = delete; + /** Allow instances of this class to be moved */ + CLScaleFactorSymm8Kernel(CLScaleFactorSymm8Kernel &&) = default; + /** Allow instances of this class to be moved */ + CLScaleFactorSymm8Kernel &operator=(CLScaleFactorSymm8Kernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor with 2 dimensions. The first dimension will be interpreted as + * batches. Data types supported: F32. + * @param[out] output Output tensor with shape [batches] which stores the scale values for each 2D + * input tensor. + * The dimensions over the first must match the batched dimensions of the input + * tensor. Data types supported: F32. + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLScaleFactorSymm8Kernel + * + * @param[in] input Input tensor info. Data types supported: F32. + * @param[in] output Output tensor info with shape [batches] which stores the scale values for + * each 2D input tensor. + * The dimensions over the first must match the batched dimensions of the input + * tensor. Data types supported: F32.
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + /** Resets global minimum and maximum + * + * @param[in,out] queue Command queue on which to map and unmap the min_max tensor + * + * NOTE(review): this class declares no min/max tensor member (only _input and _output); + * confirm in the implementation what reset() actually clears. + */ + void reset(cl::CommandQueue &queue); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h new file mode 100644 index 000000000..aa4a14812 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * @file CLTopKV2Kernel.h + * @brief This file defines classes for TopKV2Kernel + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ +#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +// these parameters can be changed +// NOTE(review): _ITEMS, _GROUPS and _HISTOSPLIT start with an underscore followed by an +// uppercase letter, which C++ reserves for the implementation; renaming them requires +// updating every user of these macros at the same time +#define _ITEMS 16 // number of items in a group +#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS +#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram +#define PERMUT // store the final permutation +//////////////////////////////////////////////////////// + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to define CLTopKV2Single + */ +class CLTopKV2Single : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Single(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + */ + CLTopKV2Single(const CLTopKV2Single &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Single to be copied + * @return Reference of this instance + */ + CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + */ + CLTopKV2Single(CLTopKV2Single &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Single to be moved + * @return Reference of this instance + */ + CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] topk_values Values of the top k predictions + * @param[in] topk_indices Indices of the top k predictions + * @param[in] indices Indices + * @param[in] temp_stack Temp stack + * @param[in] k K of the top k predictions + * @param[in] n Number times to quick-sort + * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); + + /** + * @brief Run CLTopKV2Single op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_topk_values; + ICLTensor *_topk_indices; +}; + +/** + * @brief Class to define CLTopKV2Init + */ +class CLTopKV2Init : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Init(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + */ + CLTopKV2Init(const CLTopKV2Init &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLTopKV2Init to be copied + * @return Reference of this instance + */ + CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + */ + CLTopKV2Init(CLTopKV2Init &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Init to be moved + * @return Reference of this instance + */ + CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + + /** + * @brief Initialise kernel with params + * @param[in] input An input tensor + * @param[in] in_key_buf Buffer of input key + * @param[in] in_ind_buf Buffer of input index + * @param[in] n Number times to quick-sort + * @return N/A + */ + void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); + + /** + * @brief Run CLTopKV2Init op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; +}; + +/** + * @brief Class to define CLRadixSortHistogram + */ +class CLRadixSortHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + */ + CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + */ + CLRadixSortHistogram(CLRadixSortHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * @return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + } + + /** + * @brief Run CLRadixSortHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; +}; + +/** + * @brief Class to define CLRadixSortScanHistogram + */ +class CLRadixSortScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + */ + CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + */ + CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * @return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /** + * @brief Run CLRadixSortScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortGlobalScanHistogram + */ +class CLRadixSortGlobalScanHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortGlobalScanHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + */ + CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortGlobalScanHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + */ + CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortGlobalScanHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] glob_sum_buf Buffer of global sum + * @param[out] temp_buf Temp buffer to be used while RadixSortGlobalScanHistogram + * @param[in] bits Number of bits to be used for radix sort + * @return N/A + */ + void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); + + /** + * @brief Run CLRadixSortGlobalScanHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortPasteHistogram + */ +class CLRadixSortPasteHistogram : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortPasteHistogram(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + */ + CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortPasteHistogram to be copied + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + */ + CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortPasteHistogram to be moved + * @return Reference of this instance + */ + CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[out] glob_sum_buf Buffer of global sum + * @param[in] bits Number of bits to be used for radix sort + * @return N/A + */ + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + /** + * @brief Run CLRadixSortPasteHistogram op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** + * @brief Class to define CLRadixSortReorder + */ +class CLRadixSortReorder : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLRadixSortReorder(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers).
+ * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + */ + CLRadixSortReorder(const CLRadixSortReorder &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLRadixSortReorder to be copied + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + */ + CLRadixSortReorder(CLRadixSortReorder &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLRadixSortReorder to be moved + * @return Reference of this instance + */ + CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] hist_buf Buffer of histogram + * @param[in] bits Number of bits to be used for radix sort + * @param[in] n Integer number size to sort + * @return N/A + */ + void configure(cl::Buffer *hist_buf, int bits, int n); + + /** + * @brief Set pass + * @param[in] pass Passes made of in radix sort algorithm + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * @return N/A + */ + void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + /** + * @brief Run CLRadixSortReorder op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer
*_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2FindFirstNegative + */ +class CLTopKV2FindFirstNegative : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2FindFirstNegative(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + */ + CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2FindFirstNegative to be copied + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + */ + CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2FindFirstNegative to be moved + * @return Reference of this instance + */ + CLTopKV2FindFirstNegative &operator=(CLTopKV2FindFirstNegative &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * @return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set output buffer + * @param[out] out_key_buf Buffer of output key + * @return N/A + */ + void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } + + /** + * @brief Run CLTopKV2FindFirstNegative op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window,
cl::CommandQueue &queue) override; + +private: + cl::Buffer *_out_key_buf; +}; + +/** + * @brief Class to define CLTopKV2ReorderNegatives + */ +class CLTopKV2ReorderNegatives : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2ReorderNegatives(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + */ + CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2ReorderNegatives to be copied + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + */ + CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2ReorderNegatives to be moved + * @return Reference of this instance + */ + CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] first_negative_idx_buf Buffer of the first negative index + * @param[in] n Number times to find + * @return N/A + */ + void configure(cl::Buffer *first_negative_idx_buf, int n); + + /** + * @brief Set buffers + * @param[in] in_key_buf Buffer of input key + * @param[out] out_key_buf Buffer of output key + * @param[in] in_ind_buf Buffer of input index + * @param[out] out_ind_buf Buffer of output index + * @return N/A + */ + void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _in_key_buf = in_key_buf; + _out_key_buf =
out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + + /** + * @brief Run CLTopKV2ReorderNegatives op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +/** + * @brief Class to define CLTopKV2Store + */ +class CLTopKV2Store : public ICLKernel +{ +public: + /** + * @brief Constructor + */ + CLTopKV2Store(); + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + */ + CLTopKV2Store(const CLTopKV2Store &) = delete; + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers). + * @param [in] copiedInstance Const reference of CLTopKV2Store to be copied + * @return Reference of this instance + */ + CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + */ + CLTopKV2Store(CLTopKV2Store &&) = default; + /** + * @brief Allow instances of this class to be moved + * @param [in] movedInstance Rvalue reference of CLTopKV2Store to be moved + * @return Reference of this instance + */ + CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + + /** + * @brief Initialise kernel with params + * @param[out] values Values tensor to store + * @param[out] indices Indices tensor to be used for store + * @param[in] k K of the top k predictions + * @param[in] n Number times to store + * @return N/A + */ + void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + + /** + * @brief Set buffers + * @param[out] out_key_buf Buffer of output key + * @param[out] out_ind_buf Buffer of output index + * @return N/A
+ */ + void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); + + /** + * @brief Run CLTopKV2Store op + * @param[in] window Window to be used for in_slice + * @param[in] queue cl::CommandQueue + * @return N/A + */ + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_values; + ICLTensor *_indices; + cl::Buffer *_out_key_buf; + cl::Buffer *_out_ind_buf; +}; + +} // namespace arm_compute +#endif // Disable GPU implementation +#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h new file mode 100644 index 000000000..933d8760d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ +#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +class ITensor; +class Window; +class QuantizationInfo; +} // namespace arm_compute + +namespace arm_compute +{ + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + float (*scalar_func)(const float &, const float &), + int (*broadcast_func)(int, int, int, const float *, const float &, float *, + const bool), + int (*neon_func)(int, int, int, const float *, const float *, float *)); + +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), + int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, + uint8_t *, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)); +} // namespace arm_compute +#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h new file mode 100644 index 000000000..8c544cda8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "arm_compute/core/TypesEx.h" + +namespace arm_compute +{ + +class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel +{ +public: + /** Default destructor */ + ~NEBinaryLogicalOperationKernel() = default; + + /** Set the operation and the input and output tensors of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8. 
+ * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] op Binary logical operation to be executed. + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a Status + */ + static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1, + const ITensorInfo *input2, const ITensorInfo *output); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, + const ITensorInfo &output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h new file mode 100644 index 000000000..101f6ac8e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__ +#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class for the kernel converting boolean type + */ +class NECastBoolKernel : public INEKernel +{ +public: + const char *name() const override { return "NECastBoolKernel"; } + /** Default constructor*/ + NECastBoolKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastBoolKernel(const NECastBoolKernel &) = delete; + /** Default move constructor */ + NECastBoolKernel(NECastBoolKernel &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastBoolKernel &operator=(const NECastBoolKernel &) = delete; + /** Default move assignment operator */ + NECastBoolKernel &operator=(NECastBoolKernel &&) = default; + /** Set the input and output of the kernel + * + * Valid conversions Input -> Output : + * + * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16 + * + * @param[in] input The input tensor to convert. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NECastBoolKernel + * + * @param[in] input Source tensor info. Data types supported: U8 + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTBOOLKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h new file mode 100644 index 000000000..88f21c96e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform EmbeddingLookup operation */ +class NEEmbeddingLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEEmbeddingLookupKernel"; } + /** Default constructor */ + NEEmbeddingLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEEmbeddingLookupKernel(const NEEmbeddingLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + NEEmbeddingLookupKernel &operator=(const NEEmbeddingLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel(NEEmbeddingLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEEmbeddingLookupKernel &operator=(NEEmbeddingLookupKernel &&) = default; + /** Initialize the kernel's input, output. + * + * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[in] lookups Lookups is a 1D tensor whose values are indices into the first dimension of + * input. + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEEmbeddingLookupKernel + * + * @param[in] input Source tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Destination tensor info. Data types supported: same as @p input. + * @param[in] lookups Lookups info. Data types supported: S32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + const ITensor *_lookups; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h new file mode 100644 index 000000000..5acfde5a8 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__ +#define __ARM_COMPUTE_NEGATHERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Kernel to perform gather operation on NEON */ +class NEGatherKernelEx : public INEKernel +{ +public: + /** Default constructor. */ + NEGatherKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx(const NEGatherKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete; + /** Allow instances of this class to be moved. */ + NEGatherKernelEx(NEGatherKernelEx &&) = default; + /** Allow instances of this class to be moved. */ + NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default; + /** Default destructor */ + ~NEGatherKernelEx() = default; + + /** Name of the kernel + * + * @return Kernel name + */ + const char *name() const override { return "NEGatherKernelEx"; } + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32. Each value must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. 
Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values + * wrap around. Defaults to 0 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Implementation of the gather operation for 0 axis. + * + * For gather on the 0 axis an element by element copy is performed. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info); + + /** Implementation of the gather operation. + * + * For 1<=axis a row-wise copy is taking place. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. 
+ */ + template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info); + + using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info); + + const ITensor *_input; + const ITensor *_indices; + int _axis; + size_t _indices_rank; + ITensor *_output; + kernel_ptr _func; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h new file mode 100644 index 000000000..cb2a485d5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform HashtableLookup operation */ +class NEHashtableLookupKernel : public INEKernel +{ +public: + const char *name() const override { return "NEHashtableLookupKernel"; } + /** Default constructor */ + NEHashtableLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEHashtableLookupKernel(const NEHashtableLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + NEHashtableLookupKernel &operator=(const NEHashtableLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel(NEHashtableLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NEHashtableLookupKernel &operator=(NEHashtableLookupKernel &&) = default; + /** Initialize the kernel's inputs, outputs. + * + * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEHashtableLookupKernel + * + * @param[in] lookups The lookups tensor info. Data types supported: S32. + * @param[in] keys The keys tensor info. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input The input tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output The output tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] hits The hits tensor info. A boolean tensor that indicates whether the lookup + * hits (True) or not (False). 
Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_lookups; /** Lookups tensor */ + const ITensor *_keys; /** Keys tensor */ + const ITensor *_input; /** Source tensor */ + ITensor *_output; /** Destination tensor */ + ITensor *_hits; /** Hits tensor */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h new file mode 100644 index 000000000..8724cc69b --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for performing an instance normalization */ +class NEInstanceNormalizationLayerKernelEx : public INEKernel +{ +public: + const char *name() const override { return "NEInstanceNormalizationLayerKernelEx"; } + /** Default constructor */ + NEInstanceNormalizationLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEInstanceNormalizationLayerKernelEx & + operator=(const NEInstanceNormalizationLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + NEInstanceNormalizationLayerKernelEx & + operator=(NEInstanceNormalizationLayerKernelEx &&) = default; + /** Default destructor */ + ~NEInstanceNormalizationLayerKernelEx() = default; + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: + * NCHW + * In case of @p output tensor = nullptr this tensor will store the result + * of the normalization. + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. + * Defaults to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. 
Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma = nullptr, ITensor *beta = nullptr, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayer. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults + * to 1.0 + * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. + * Defaults to 0.0 + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Common signature for all the specialized instance normalization functions + * + * @param[in, out] input An input tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * @param[out] output The output tensor. + * @param[in] gamma The scale scalar value applied to the normalized tensor. Defaults to + * 1.0 + * @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to + * 0.0 + * @param[in] epsilon Lower bound value for the normalization. 
Defaults to 1e-12 + */ + using NormalizationFunction = void(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window); + + NormalizationFunction *_func; + ITensor *_input; + ITensor *_output; + ITensor *_gamma; + ITensor *_beta; + float _epsilon; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h new file mode 100644 index 000000000..198b0be9d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ +#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface to multiply scale factor kernel. */ +class NEMultiplyScaleFactorKernel : public INEKernel +{ +public: + const char *name() const override { return "NEMultiplyScaleFactorKernel"; } + /** Default constructor */ + NEMultiplyScaleFactorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel(const NEMultiplyScaleFactorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMultiplyScaleFactorKernel &operator=(const NEMultiplyScaleFactorKernel &) = delete; + /** Default Move Constructor. 
*/ + NEMultiplyScaleFactorKernel(NEMultiplyScaleFactorKernel &&) = default; + /** Default move assignment operator */ + NEMultiplyScaleFactorKernel &operator=(NEMultiplyScaleFactorKernel &&) = default; + /** Default destructor */ + ~NEMultiplyScaleFactorKernel() = default; + /** Set input, output tensors. + * + * @param[in/out] input Source tensor. Data type supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p scale_factor. + */ + void configure(const ITensor *input, const ITensor *scale_factor, ITensor *output, + float multiplier = 1.f); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEMultiplyScaleFactorKernel + * + * @param[in] input Input tensor info. Data types supported: S32. + * @param[in] scale_factor Scale tensor. Data type supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p scale_factor. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output, float multiplier = 1.f); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void multiply(const Window &window); + +private: + const ITensor *_input; + const ITensor *_scale_factor; + ITensor *_output; + float _multiplier; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h new file mode 100644 index 000000000..99bb351bc --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__ +#define __ARM_COMPUTE_NEONEHOTKERNEL_H__ +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" +namespace arm_compute +{ +// Forward declarations +class ITensor; +/** Kernel to perform other operation on NEON */ +class NEOneHotKernel : public INEKernel +{ +public: + /** Default constructor. */ + NEOneHotKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEOneHotKernel(const NEOneHotKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + NEOneHotKernel &operator=(const NEOneHotKernel &) = delete; + /** Allow instances of this class to be moved. */ + NEOneHotKernel(NEOneHotKernel &&) = default; + /** Allow instances of this class to be moved. */ + NEOneHotKernel &operator=(NEOneHotKernel &&) = default; + /** Default detructor */ + ~NEOneHotKernel() = default; + /** Name of the kernel + * + * @return Kernel name + */ + const char *name() const override { return "NEOneHotKernel"; } + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up to + * 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: Same + * as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * The value must be in range [-indices.rank , indices.rank) + */ + void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEOneHotKernel + * + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: + * up to 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis = -1); + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Implementation of the onehot operation for 0 axis. + * + * For onehot on the 0 axis an element by element copy is performed. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void onehot_0_axis(const Window &window, const ThreadInfo &info); + /** Implementation of the onehot operation. + * + * For 1<=axis a row-wise copy is taking place. 
+ * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void onehot_n_axis(const Window &window, const ThreadInfo &info); + using kernel_ptr = void (NEOneHotKernel::*)(const Window &window, const ThreadInfo &info); + const ITensor *_indices; + const ITensor *_depth; + const ITensor *_on_value; + const ITensor *_off_value; + int _axis; + ITensor *_output; + kernel_ptr _func; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEONEHOTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h new file mode 100644 index 000000000..0b080cf73 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ +#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the dequantization layer kernel. 
*/ +class NEQuantizationSymmetricKernel : public INEKernel +{ +public: + const char *name() const override { return "NEQuantizationSymmetricKernel"; } + /** Default constructor */ + NEQuantizationSymmetricKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel(const NEQuantizationSymmetricKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationSymmetricKernel &operator=(const NEQuantizationSymmetricKernel &) = delete; + /** Default Move Constructor. */ + NEQuantizationSymmetricKernel(NEQuantizationSymmetricKernel &&) = default; + /** Default move assignment operator */ + NEQuantizationSymmetricKernel &operator=(NEQuantizationSymmetricKernel &&) = default; + /** Default destructor */ + ~NEQuantizationSymmetricKernel() = default; + /** Set input, output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[out] output Destination tensor with the same dimensions of input. Data type supported: + * S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output, ITensor *scale_factor); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEQuantizationSymmetricKernel + * + * @param[in] input Input tensor info. Data types supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: S8. + * @param[out] scale_factor Scale tensor of @p output. Data type supported: Same as @p input. 
+ * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + template <typename T> void quantize(const Window &window); + +private: + const ITensor *_input; + ITensor *_output; + ITensor *_scale_factor; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h new file mode 100644 index 000000000..cda8a30b1 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_TYPESEX_H__ +#define __ARM_COMPUTE_TYPESEX_H__ + +namespace arm_compute +{ + +/** Available ArgIndex operations **/ +enum class ArgOperation +{ + MAX, + MIN, +}; + +/** Available binary logical operations */ +enum class BinaryLogicalOperation +{ + AND, /**< AND */ + OR, /**< OR */ +}; + +enum class ComparisonOperationEx +{ + EQUAL, /**< EQUAL */ + NOT_EQUAL, /**< NOT_EQUAL */ +}; + +enum class ElementWiseUnaryEx +{ + NEG, /**< NEG */ +}; + +enum class SubDataType +{ + NONE, + BOOL, +}; + +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPESEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h new file mode 100644 index 000000000..d57e8fcf5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_UTILSEX_H__ +#define __ARM_COMPUTE_UTILSEX_H__ + +#include <utility> + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ + +/** Returns expected width and height of the transpose convolution's output tensor. + * + * @note This function was copied in order to fix a bug computing to wrong output dimensions. + * + * @param[in] in_width Width of input tensor (Number of columns) + * @param[in] in_height Height of input tensor (Number of rows) + * @param[in] kernel_width Kernel width. + * @param[in] kernel_height Kernel height. + * @param[in] info padding and stride info. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_top The number of zeros added to bottom edge of the output. 
+ * + * @return A pair with the new width in the first position and the new height in the second. + */ +const std::pair<unsigned int, unsigned int> +transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_top); +} +#endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h new file mode 100644 index 000000000..1e69f0912 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ +#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Utils.h" + +#include "arm_compute/core/utils/helpers/tensor_transform.h" + +#include <cmath> + +namespace arm_compute +{ +namespace misc +{ +namespace shape_calculator +{ + +/** Calculate the upsampled output shape used for transpose convolution + * + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * @param[in] info Padding and stride info + * @param[in] out_dims Output shape dimensions + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * @param[out] pad_left Padding on left + * @param[out] pad_right Padding on right + * @param[out] pad_top Padding on top + * @param[out] pad_bottom Padding on bottom + * + * @return the calculated shape + */ +inline TensorShape compute_transposeconv_upsampled_shape( + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) +{ + unsigned int sx = info.stride().first; + unsigned int sy = info.stride().second; + const DataLayout data_layout = input.data_layout(); + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Find the upsampled dimensions + // transpose conv out: + // tconv_out + pad = 1 + (in - 1) * stride + invalid + // tconv_out = 1 + (in - 1) * stride + invalid - pad + // upsample out: + // upsample_out = 1 + (in - 1) * stride + unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1; + unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1; + + // Find the padding needed for the convolution with stride 1 in order to match output shape + // upsample+pad out: + // upsample_out + pad = tconv_out + kernel - 1 + // pad = tconv_out + kernel - 1 - upsample_out + unsigned int padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1); + unsigned int pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1); + out_x += padx; + out_y += pady; + + unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; + unsigned int pady_all_except_invallid = + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); + pad_right = pady_all_except_invallid / 2 - info.pad_right() 
+ invalid_right; + pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); + pad_bottom = pady_all_except_invallid / 2 - info.pad_bottom() + invalid_bottom; + + TensorShape scale_out_shape(input.tensor_shape()); + scale_out_shape.set(idx_w, out_x); + scale_out_shape.set(idx_h, out_y); + + return scale_out_shape; +} + +/** Calculate the output shape of the transpose convolution layer + * + * @param[in] out_dims Output x and y shape dimensions + * @param[in] input Input tensor info + * @param[in] weights Weights tensor shape + * + * @return the calculated shape + */ +inline TensorShape +compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, + const ITensorInfo &input, const ITensorInfo &weights) +{ + const TensorShape input_shape{input.tensor_shape()}; + const TensorShape weights_shape{weights.tensor_shape()}; + + const DataLayout data_layout = input.data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + TensorShape out_shape{input_shape}; + out_shape.set(width_idx, out_dims.first); + out_shape.set(height_idx, out_dims.second); + out_shape.set(channel_idx, weights_shape[batch_idx]); + return out_shape; +} + +/** Calculate the depth to space output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int block) +{ + ARM_COMPUTE_ERROR_ON(block < 2); + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + 
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(idx_width, input->dimension(idx_width) * block); + output_shape.set(idx_height, input->dimension(idx_height) * block); + output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block)); + + return output_shape; +} + +/** Calculate the space to batch output shape of a tensor + * + * @param[in] input Input tensor info + * @param[in] block_shape Block shape value + * + * @return the calculated shape + */ +inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int32_t block_shape) +{ + ARM_COMPUTE_ERROR_ON(block_shape < 2); + TensorShape output_shape{input->tensor_shape()}; + + const DataLayout data_layout = input->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_shape); + output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_shape); + output_shape.set(idx_depth, input->tensor_shape()[idx_depth] / (block_shape * block_shape)); + + return output_shape; +} + +/** Calculate the gather output shape of a tensor + * + * @param[in] input_shape Input tensor shape + * @param[in] indices_shape Indices tensor shape + * @param[in] actual_axis The axis to be gathered + * + * @return the calculated shape + */ +inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape, + const TensorShape &indices_shape, uint32_t actual_axis) +{ + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); + 
ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4); + ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions()); + + TensorShape output_shape = input_shape; + if (indices_shape.num_dimensions() == 1) + { + output_shape[actual_axis] = indices_shape[0]; + } + else if (indices_shape.num_dimensions() > 1) + { + output_shape.shift_right(indices_shape.num_dimensions() - 1); + + for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i) + { + if (o == actual_axis) + { + ++i; + for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o) + { + output_shape[o] = indices_shape[in]; + } + } + else + { + output_shape[o] = input_shape[i]; + } + } + } + return output_shape; +} + +/** Calculate the gather output shape of a tensor + * + * @param[in] input_shape Input tensor shape + * @param[in] indices_shape Indices tensor shape + * @param[in] actual_axis The axis to be gathered + * + * @return the calculated shape + */ +inline TensorShape compute_onehot_shape_ex(const TensorShape &indices_shape, uint32_t depth, + uint32_t actual_axis) +{ + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON(actual_axis > indices_shape.num_dimensions()); + + TensorShape output_shape; + output_shape.set(actual_axis, depth); + + unsigned int i_shift = 0; + for (unsigned int i = 0; i < indices_shape.num_dimensions(); ++i) + { + if (i == actual_axis) + { + i_shift++; + } + output_shape.set(i + i_shift, indices_shape[i]); + } + + return output_shape; +} + +} // namespace shape_calculator +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_EX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h new file mode 100644 index 000000000..484ebfd0b --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__ +#define __ARM_COMPUTE_CLFUNCTIONSEX_H__ + +#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h> +#include <arm_compute/runtime/CL/functions/CLCastBool.h> +#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h> +#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/CL/functions/CLGatherEx.h> +#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h> +#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/CL/functions/CLNeg.h> +#include <arm_compute/runtime/CL/functions/CLOneHot.h> +#include <arm_compute/runtime/CL/functions/CLReduceOperation.h> +#include <arm_compute/runtime/CL/functions/CLSplitVEx.h> +#include <arm_compute/runtime/CL/functions/CLTopKV2.h> +#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h> + +#endif // __ARM_COMPUTE_CLFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h new file mode 100644 index 000000000..b1ee52bf9 --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
+
+#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+class ICLTensor;
+
+/** Function to calculate the index of the minimum or maximum values in a
+ * tensor based on an axis.
+ *
+ * @note The default data type for an uninitialized output tensor is
+ *       signed 32-bit integer (S32). It is the user's responsibility to check
+ *       that the results do not overflow because the indices are computed
+ *       in unsigned 32-bit (U32).
+ *
+ * @note NOTE(review): this header uses std::shared_ptr and std::vector but does not include
+ *       <memory> or <vector> itself; they are presumably provided transitively by the
+ *       headers included above - confirm.
+ */
+class CLArgMinMaxLayerEx : public IFunction
+{
+public:
+  /** Default Constructor.
+   *
+   * @param[in] memory_manager (Optional) Memory manager. Defaults to nullptr.
+   */
+  CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input  Input source tensor. Data types supported: QASYMM8/F16/F32.
+   * @param[in]  axis   Axis to find max/min index.
+   * @param[out] output Output tensor receiving the indices. Data types supported: U32/S32.
+   * @param[in]  op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+   *                    ARG_IDX_MIN
+   */
+  void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLArgMinMaxLayerEx
+   *
+   * @param[in] input  Input source tensor info. Data types supported: QASYMM8/F16/F32.
+   * @param[in] axis   Axis to find max/min index.
+   * @param[in] output Output tensor info. Data types supported: U32/S32.
+   * @param[in] op     Reduction operation to perform. Operations supported: ARG_IDX_MAX,
+   *                   ARG_IDX_MIN
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output,
+                         const ReductionOperation &op);
+
+  // Inherited methods overridden:
+  void run() override;
+
+private:
+  MemoryGroup _memory_group;
+  std::vector<CLTensor> _results_vector; // presumably intermediate per-stage results - confirm in the .cpp
+  CLTensor _not_reshaped_output;
+  std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector; // presumably one kernel per stage - confirm
+  CLReshapeLayerKernel _reshape_kernel;
+  unsigned int _num_of_stages;
+  unsigned int _reduction_axis;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
new file mode 100644
index 000000000..88a9b00ec
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+#define __ARM_COMPUTE_CLBINARYLOGICALOP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/TypesEx.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Function that takes two source tensors and a @ref BinaryLogicalOperation and writes one
+ * output tensor.
+ */
+class CLBinaryLogicalOp : public ICLSimpleFunction
+{
+public:
+  /** Initialise the function's source and destination.
+   *
+   * @param[in]  input1 Source tensor1. Data types supported: U8, QASYMM8.
+   * @param[in]  input2 Source tensor2. Data types supported: U8, QASYMM8.
+   * @param[out] output Output tensor. Data types supported: U8, QASYMM8.
+   * @param[in]  op     Binary logical operation to perform (a @ref BinaryLogicalOperation
+   *                    value).
+   */
+  void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
+                 BinaryLogicalOperation op);
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLBINARYLOGICALOP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
new file mode 100644
index 000000000..d6150684a
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLCastBool.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains the arm_compute::CLCastBool class
+ */
+
+#ifndef ARM_COMPUTE_CLCASTBOOL_H
+#define ARM_COMPUTE_CLCASTBOOL_H
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to run @ref CLCastBoolKernel.
+ *        This converts the boolean input tensor to the output tensor's type.
+ */
+class CLCastBool : public ICLSimpleFunction
+{
+public:
+  /**
+   * @brief Initialise the kernel's input and output
+   * @param[in]  input  Input tensor holding the boolean values. Data types supported: U8
+   * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/F16/F32.
+   */
+  void configure(ICLTensor *input, ICLTensor *output);
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCASTBOOL_H */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
new file mode 100644
index 000000000..409eaf593
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Function to run the deconvolution layer.
+ *
+ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. Input stride defines how many zeroes we should put between each element of the
+ * input and pad is the amount of padding.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ *      width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ *      height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where:
+ *      width_input is the size of the first input dimension.
+ *      height_input is the size of the second input dimension.
+ *      width_output is the size of the first output dimension.
+ *      height_output is the size of the second output dimension.
+ *      kernel_x and kernel_y are the convolution sizes in x and y.
+ *      stride_x and stride_y are the input strides of the first and second dimension.
+ *
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLDeconvolutionLayerUpsample
+ * -# @ref CLConvolutionLayer
+ * -# @ref CLReverse (used to flip the weights, see above)
+ *
+ * @note The formulas above do not account for the @p invalid_right / @p invalid_bottom
+ *       arguments of @ref configure.
+ */
+class CLDirectTransposeConvLayer : public IFunction
+{
+public:
+  /** Constructor
+   *
+   * @param[in] memory_manager (Optional) Memory manager. Defaults to nullptr.
+   */
+  CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete;
+  /** Default move constructor */
+  CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete;
+  /** Default move assignment operator */
+  CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
+  /** Set the input, weights, biases and output tensors.
+   *
+   * @param[in,out] input          Input tensor. 3 lower dimensions represent a single input, and an
+   *                               optional 4th dimension for batch of inputs.
+   *                               Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in]     weights        The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+   *                               supported: Same as @p input.
+   * @param[in]     bias           (Optional) The biases have one dimension.
+   *                               Data type supported: Should match @p input data type, except for
+   *                               input of QASYMM8 and QASYMM8_SIGNED type where biases should be of
+   *                               S32 type
+   * @param[out]    output         Output tensor. The output has the same number of dimensions as the
+   *                               @p input.
+   * @param[in]     info           Contains padding and policies to be used in the deconvolution, this
+   *                               is described in @ref PadStrideInfo.
+   * @param[in]     invalid_right  The number of zeros added to right edge of the output.
+   * @param[in]     invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in]     weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
+   *                               specifies if the weights tensor has been reshaped with @ref
+   *                               CLWeightsReshapeKernel.
+   *
+   */
+  void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+                 const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+                 const WeightsInfo &weights_info = WeightsInfo());
+  /** Set the input, weights, biases and output tensors.
+   *
+   * @param[in]     compile_context The compile context to be used.
+   * @param[in,out] input           Input tensor. 3 lower dimensions represent a single input, and
+   *                                an optional 4th dimension for batch of inputs.
+   *                                Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in]     weights         The 4d weights with dimensions [width, height, IFM, OFM]. Data
+   *                                type supported: Same as @p input.
+   * @param[in]     bias            (Optional) The biases have one dimension.
+   *                                Data type supported: Should match @p input data type, except for
+   *                                input of QASYMM8 and QASYMM8_SIGNED type where biases should be of
+   *                                S32 type
+   * @param[out]    output          Output tensor. The output has the same number of dimensions as
+   *                                the @p input.
+   * @param[in]     info            Contains padding and policies to be used in the deconvolution,
+   *                                this is described in @ref PadStrideInfo.
+   * @param[in]     invalid_right   The number of zeros added to right edge of the output.
+   * @param[in]     invalid_bottom  The number of zeros added to bottom edge of the output.
+   * @param[in]     weights_info    (Optional) Weights information needed for @ref
+   *                                CLConvolutionLayer, specifies if the weights tensor has been
+   *                                reshaped with @ref CLWeightsReshapeKernel.
+   *
+   */
+  void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
+                 const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+                 unsigned int invalid_right, unsigned int invalid_bottom,
+                 const WeightsInfo &weights_info = WeightsInfo());
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLDirectTransposeConvLayer
+   *
+   * @param[in] input          Input tensor info. 3 lower dimensions represent a single input, and an
+   *                           optional 4th dimension for batch of inputs.
+   *                           Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+   * @param[in] weights        The 4d weights info with dimensions [width, height, IFM, OFM]. Data
+   *                           type supported: Same as @p input.
+   * @param[in] bias           (Optional) The biases have one dimension.
+   *                           Data type supported: Should match @p input data type, except for input
+   *                           of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+   * @param[in] output         Output tensor info. The output has the same number of dimensions as the
+   *                           @p input.
+   * @param[in] info           Contains padding and policies to be used in the deconvolution, this is
+   *                           described in @ref PadStrideInfo.
+   * @param[in] invalid_right  The number of zeros added to right edge of the output.
+   * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+   * @param[in] weights_info   (Optional) Weights information needed for @ref CLConvolutionLayer,
+   *                           specifies if the weights tensor has been reshaped with @ref
+   *                           CLWeightsReshapeKernel.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+                         const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+                         unsigned int invalid_right, unsigned int invalid_bottom,
+                         const WeightsInfo &weights_info = WeightsInfo());
+
+  // Inherited methods overridden:
+  void run() override;
+  void prepare() override;
+
+private:
+  MemoryGroup _memory_group;
+  CLDeconvolutionLayerUpsample _scale_f; // upsampling step listed in the class description
+  CLConvolutionLayer _conv_f;            // convolution step listed in the class description
+  CLReverse _flip_weights;               // reverses the weights (see class description)
+
+  CLTensor _scaled_output;
+  ICLTensor *_original_weights; // raw pointer; presumably non-owning - confirm in the .cpp
+  CLTensor _weights_flipped;
+  CLTensor _flip_axis;
+
+  bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
new file mode 100644
index 000000000..fbee7e40e
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * @file CLEmbeddingLookup.h
+ * @ingroup COM_AI_RUNTIME
+ * @brief This file contains the arm_compute::CLEmbeddingLookup class
+ */
+
+#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+#define __ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/**
+ * @brief Class to perform EmbeddingLookup operation
+ */
+class CLEmbeddingLookup : public ICLSimpleFunction
+{
+public:
+  /**
+   * @brief Set the input and output tensors.
+   * @param[in]  input   Source tensor.
+   *                     Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p
+   *                     input.
+   * @param[in]  lookups 1D tensor whose values are indices into the first dimension of
+   *                     @p input.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
new file mode 100644
index 000000000..f3266f688
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__
+#define __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
+#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls
+ * the following kernels:
+ *
+ *  -# @ref CLTransposeKernel
+ *
+ * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedHybridLayerReshapeWeights : public ICLSimpleFunction
+{
+public:
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input  Weights tensor. The weights must be 2 dimensional. Data types supported:
+   *                    S8.
+   * @param[out] output Destination tensor which stores the transposed input tensor. Data type
+   *                    supported: Same as @p input.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * CLFullyConnectedHybridLayerReshapeWeights
+   *
+   * @param[in] input  Weights tensor info. The weights must be 2 dimensional. Data types
+   *                   supported: S8.
+   * @param[in] output Destination tensor info, describing the tensor that stores the transposed
+   *                   input. Data type supported: Same as @p input.
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following
+ * OpenCL kernels:
+ *
+ *  -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
+ *  -# @ref CLFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false
+ * and transpose_weights is set to true ) (called once)
+ *  -# @ref CLGEMMLowpMatrixMultiplyCore (if quantized symmetric)
+ *  -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
+ *
+ * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedHybridLayer : public IFunction
+{
+public:
+  /** Constructor */
+  CLFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLFullyConnectedHybridLayer(const CLFullyConnectedHybridLayer &) = delete;
+  /** Default move constructor */
+  CLFullyConnectedHybridLayer(CLFullyConnectedHybridLayer &&) = default;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLFullyConnectedHybridLayer &operator=(const CLFullyConnectedHybridLayer &) = delete;
+  /** Default move assignment operator */
+  CLFullyConnectedHybridLayer &operator=(CLFullyConnectedHybridLayer &&) = default;
+  /** Set the input and output tensors.
+   *
+   * @param[in]  input   Source tensor.
Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLFullyConnectedHybridLayer + * + * @param[in] input Source tensor info. Data type supported: F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. 
Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, + bool retain_internal_weights); + + MemoryGroup _memory_group; + CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; + CLScaleFactorSymm8Kernel _scale_factor_kernel; + CLQuantizationSymmetricKernel _quant_input_kernel; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLMultiplyScaleFactorKernel _multiply_scale_kernel; + CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to + // add bias in + // CLFullyConnectedHybridLayer + CLTensor _reshape_weights_output; + CLTensor _quantized_input; + CLTensor _scale_factor; + CLTensor _gemmlowp_output; + bool _are_weights_reshaped; + bool _accumulate_biases; + bool _is_prepared; + const ICLTensor *_original_weights; +}; +} +#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h new file mode 100644 index 000000000..e65a646dc --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -0,0 +1,235 @@ +/* + * 
Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ +#define __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" +#include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls + * the following kernels: + * + * -# @ref CLTransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CLFullyConnectedLayerReshapeWeightsEx : public ICLSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor which stores the transposed input tensor. Data type + * supported: Same as @p input. + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLFullyConnectedLayerReshapeWeightsEx + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[in] output Destination tensor which stores the transposed input tensor. Data type + * supported: Same as @p input. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +namespace weights_transformations +{ +/** Basic function to manage the reshape weights generated from @ref + * CLFullyConnectedLayerReshapeWeightsEx */ +class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights +{ +public: + // Inherited method override + void run() override + { + _output.allocator()->allocate(); + _func.run(); + _reshape_run = true; + } + + // Inherited method override + void release() override { _output.allocator()->free(); } + + // Inherited method override + ICLTensor *get_weights() override { return &_output; } + + // Inherited method override + uint32_t uid() override { return _uid; } + + /** Configures the @ref CLFullyConnectedLayerReshapeWeightsEx function + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + */ + void configure(const ICLTensor *input) { _func.configure(input, &_output); } + +private: + static constexpr uint32_t _uid = 0x0; + CLTensor _output{}; + CLFullyConnectedLayerReshapeWeightsEx _func{}; +}; +} // namespace weights_transformations + +/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following + * OpenCL kernels: + * + * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref CLFullyConnectedLayerReshapeWeightsEx (if @p are_weights_reshaped is set to false and + * transpose_weights is set to true ) (called once) + * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref + * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
+ */ +class CLFullyConnectedLayerEx : public IFunction +{ +public: + /** Constructor */ + CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFullyConnectedLayerEx(const CLFullyConnectedLayerEx &) = delete; + /** Default move constructor */ + CLFullyConnectedLayerEx(CLFullyConnectedLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFullyConnectedLayerEx &operator=(const CLFullyConnectedLayerEx &) = delete; + /** Default move assignment operator */ + CLFullyConnectedLayerEx &operator=(CLFullyConnectedLayerEx &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, + ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLFullyConnectedLayerEx + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const FullyConnectedLayerInfo &fc_info); + + MemoryGroup _memory_group; + IWeightsManager *_weights_manager; + CLConvertFullyConnectedWeights _convert_weights; + weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed; + weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged + _reshape_weights_managed_function; + CLFlattenLayer _flatten_layer; + CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; + CLGEMM _mm_gemm; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLTensor _flatten_output; + CLTensor _converted_weights_output; + CLTensor _reshape_weights_output; + bool _are_weights_converted; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _is_quantized; + bool _is_prepared; + const ICLTensor *_original_weights; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..289ab167f --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       CLFullyConnectedReshapingLayer.h + * @brief      This file contains CLFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/CL/CLTensor.h> +#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class CLFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + enum class KernelType + { + GENERAL, //< General FC + PREPROCESSED_WEIGHTS //< Weights are constants so it can be preprocessed + }; + +public: + CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * 
@param[in] biases The tensor that is filled with bias values + * @param[out] output The destination tensor + * @param[in] needs_reshape Whether it needs to be reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @return N/A + */ + void configure(const arm_compute::ICLTensor *input, const arm_compute::ICLTensor *weights, + const arm_compute::ICLTensor *biases, arm_compute::ICLTensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape, + KernelType kernel_type); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + const arm_compute::ICLTensor *_input; + const arm_compute::ICLTensor *_weights; + const arm_compute::ICLTensor *_biases; + arm_compute::ICLTensor *_output; + + // buffer for reshaping input tensor + arm_compute::CLTensor _cl_buffer; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + std::unique_ptr<arm_compute::IFunction> _cl_fc; + CLReshapeLayer _cl_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_CL_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h new file mode 100644 index 000000000..b01ec4255 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLGatherEx.h + * @brief This file contains CLGatherEx class + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_CLGATHEREX_H__ +#define __ARM_COMPUTE_CLGATHEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to run @ref CLGatherKernel.
+ */ +class CLGatherEx : public ICLSimpleFunction +{ +public: + /** + * @brief Initialise the kernel's inputs, output and gather axis. + * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indices tensor. Data types supported: S32. + * @param[out] output The output tensor. Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return N/A + */ + void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); + + /** + * @brief Static function to check if given info will lead to a valid configuration + * of @ref CLGatherEx + * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32. + * @param[in] indices An indices tensor info. Data types supported: S32. + * @param[in] output The output tensor info. Data types supported: same as @p input. + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h new file mode 100644 index 000000000..6618f5aa4 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * @file CLHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_CLHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class CLHashtableLookup : public ICLSimpleFunction +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor whose values are indices into the first dimension of + * input. + * @param[in] keys Keys 1D tensor. The keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, + ICLTensor *output, ICLTensor *hits); +}; +} +#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..887e7aaa5 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to perform a Instance normalization. 
+ * + * This function runs the following kernels: + * -# @ref CLInstanceNormalizationLayerKernelEx + */ +class CLInstanceNormalizationLayerEx : public ICLSimpleFunction +{ +public: + /** Default constructor */ + CLInstanceNormalizationLayerEx(); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma = nullptr, + ICLTensor *beta = nullptr, float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLInstanceNormalizationLayerEx. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) The scale tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] beta (Optional) The offset tensor applied to the normalized tensor. Defaults to + * nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. 
Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h new file mode 100644 index 000000000..8ec9aa307 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLNeg.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_CLNEG_H__ +#define __ARM_COMPUTE_CLNEG_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +class CLNeg : public ICLSimpleFunction +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] input Source tensor. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + * + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLNEG_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h new file mode 100644 index 000000000..2bbfca821 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLONEHOT_H__ +#define __ARM_COMPUTE_CLONEHOT_H__ +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" +#include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/runtime/IFunction.h" +namespace arm_compute +{ +class ICLTensor; +/** Basic function to run @ref CLOneHotKernel */ +class CLOneHot : public IFunction +{ +public: + /** Constructor */ + CLOneHot(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHot(const CLOneHot &) = delete; + /** Default move constructor */ + CLOneHot(CLOneHot &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHot &operator=(const CLOneHot &) = delete; + /** Default move assignment operator */ + CLOneHot &operator=(CLOneHot &&) = default; + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value, + ICLTensor *output, int depth, int axis = -1); + /** Initialise the kernel's inputs and outputs with off_value being constant + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. 
Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] off_value The PixelValue for off value. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + PixelValue off_value, int depth, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis = -1); + + // Inherited methods overridden: + void run() override; + +private: + CLMemsetKernel _memset_kernel; /**< Memset kernel */ + CLOneHotKernel _onehot_kernel; /**< OneHot kernel */ + bool _has_to_memset; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLONEHOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h new file mode 100644 index 000000000..bb852e404 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * @file CLReduceOperation.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLReduceOperation class + */ + +#ifndef __ARM_COMPUTE_CLREDUCEOPERATION_H__ +#define __ARM_COMPUTE_CLREDUCEOPERATION_H__ + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to perform ReduceOperation + */ +class CLReduceOperation : public IFunction +{ +public: + /** + * @brief Construct a new ReduceOperation object + */ + CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager); + + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. Data types supported: U8/S32/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[in] op Reduce operation to perform. + * @return N/A + */ + void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, + bool keep_dims, ReductionOperation op); + + /** + * @brief Static function to check if given info will lead to a valid configuration of @ref + * CLReduceOperation. + * @param[in] input Source tensor info. Data types supported: U8/S32/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. 
+ * @param[in] op Reduce operation to perform. + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const std::set<uint32_t> &axis, bool keep_dims, + const ReductionOperation &op); + + /** + * @brief Run the OpenCL kernel for this operation + * @return N/A + */ + void run() override; + +private: + MemoryGroup _memory_group; + ICLTensor *_input; + ICLTensor *_output; + std::set<uint32_t> _axis; + bool _keep_dims; + + std::unique_ptr<CLTensor[]> _interm_tensors{nullptr}; + std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; + CLReshapeLayer _reshape; +}; +} +#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h new file mode 100644 index 000000000..bb741d98d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLSPLITVEX__ +#define __ARM_COMPUTE_CLSPLITVEX__ + +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/CL/functions/CLSlice.h" +#include "arm_compute/core/Types.h" +#include <vector> +#include <memory> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSplitVKernel */ +class CLSplitVEx : public IFunction +{ +public: + /** Default constructor */ + CLSplitVEx(); + /** Configure the split CL kernel + * + * @param[in] input The input tensor to split. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] size_splits A 1-D tensor containing the number of tensor values per split + * @param[out] outputs A vector containing the output tensor. 
Data types supported: Same as @p + * input + * The output tensors should match the input tensor dimensions for all + * shape dimensions apart + * from the split dimension. + * @param[in] split_dim Integer value representing the input tensor dimension along which to + * split + * @param[in] num_splits Number of splits + */ + void configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim, + const std::vector<ICLTensor *> &outputs, unsigned int num_splits); + + void run() override; + +private: + const ICLTensor *_input; + const ICLTensor *_size_splits; + std::vector<ICLTensor *> _outputs; + unsigned int _num_splits; + std::vector<CLSlice> _slice_functions; +}; +} +#endif /* __ARM_COMPUTE_CLSPLITVEX__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h new file mode 100644 index 000000000..e301a5152 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file CLTopKV2.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLTopKV2 class + */ +#ifndef __ARM_COMPUTE_CLTOPK_V2_H__ +#define __ARM_COMPUTE_CLTOPK_V2_H__ + +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class to execute TopKV2 operation. 
+ */ +class CLTopKV2 : public IFunction +{ +public: + /** + * @brief Construct a new CLTopKV2 object + */ + CLTopKV2(); + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLTopKV2(const CLTopKV2 &) = delete; + + /** + * @brief Prevent instances of this class from being copied (As this class contains pointers) + */ + CLTopKV2 &operator=(const CLTopKV2 &) = delete; + + /** + * @brief Construct a new CLTopKV2 object by using move constructor + * @param[in] CLTopKV2 object to move + */ + CLTopKV2(CLTopKV2 &&) = default; + + /** + * @brief Assign a CLTopKV2 object. + * @param[in] CLTopKV2 object to assign. This object will be moved. + */ + CLTopKV2 &operator=(CLTopKV2 &&) = default; + + /** + * @brief Initialise the kernel's inputs and outputs. + * @param[in] input Input image. Data types supported: U8/S16/F32. + * @param[in] k The value of `k`. + * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if + * input type is F32. + * @param[out] indices Indices related to top k values. Data types supported: S32 if input type + * is U8/S16, F32 if input type is F32. + * @return N/A + */ + void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits = 32, int bits = 4); + + /** + * @brief Run the kernels contained in the function + * Depending on the value of the following environment variables it works differently: + * - If the value of environment variable "ACL_TOPKV2" == "GPU_SINGLE", + * quick sort on GPU is used. + * - If the value of environment variable "ACL_TOPKV2" == "GPU", + * radix sort on GPU is used. 
+ * - For other value, TopKV2 runs on CPU + * @return N/A + */ + void run() override; + +private: + void run_on_cpu(); + void run_on_gpu(); + void run_on_gpu_single_quicksort(); + + uint32_t _k; + uint32_t _total_bits; + uint32_t _bits; + uint32_t _radix; + uint32_t _hist_buf_size; + uint32_t _glob_sum_buf_size; + uint32_t _n; + + ICLTensor *_input; + ICLTensor *_values; + ICLTensor *_indices; + + cl::Buffer _qs_idx_buf; + cl::Buffer _qs_temp_buf; + cl::Buffer _hist_buf; + cl::Buffer _glob_sum_buf; + cl::Buffer _temp_buf; + cl::Buffer _first_negative_idx_buf; + cl::Buffer _in_key_buf; + cl::Buffer _out_key_buf; + cl::Buffer _in_ind_buf; + cl::Buffer _out_ind_buf; + + cl::Buffer *_p_in_key_buf; + cl::Buffer *_p_out_key_buf; + cl::Buffer *_p_in_ind_buf; + cl::Buffer *_p_out_ind_buf; +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + CLTopKV2Single _qs_kernel; + CLTopKV2Init _init_kernel; + CLRadixSortHistogram _hist_kernel; + CLRadixSortScanHistogram _scan_hist_kernel; + CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel; + CLRadixSortPasteHistogram _paste_hist_kernel; + CLRadixSortReorder _reorder_kernel; + CLTopKV2FindFirstNegative _find_first_negative_kernel; + CLTopKV2ReorderNegatives _reorder_negatives_kernel; + CLTopKV2Store _store_kernel; +#endif +}; +} +#endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h new file mode 100644 index 000000000..5fb102e47 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" +#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include <memory> + +namespace arm_compute +{ +/** Basic function to compute the deconvolution layer. This function calls the following OpenCL + * kernels/functions: + * + * -# @ref CLGEMMDeconvolutionLayer + * -# @ref CLDirectTransposeConvLayer + */ +class CLTransposeConvLayer : public IFunction +{ +public: + /** Default constructor */ + CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same + * as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this + * is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. 
+ * + */ + void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); + /** Set the input, weights, biases and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and + * an optional 4th dimension for batch of inputs. Data types supported: + * QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: + * Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref + * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref + * CLWeightsReshapeKernel. + * + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLTransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. 
Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data + * type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as + * @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions as the + * @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info = WeightsInfo()); + + static DeconvolutionMethod + get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info); + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + std::unique_ptr<IFunction> _function; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h new file mode 100644 index 000000000..efc296d6c --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ +#define __ARM_COMPUTE_NEFUNCTIONSEX_H__ + +#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> +#include <arm_compute/runtime/NEON/functions/NECastBool.h> +#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> +#include <arm_compute/runtime/NEON/functions/NEGatherEx.h> +#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h> +#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEOneHot.h> +#include <arm_compute/runtime/NEON/functions/NEReduceSum.h> +#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h> +#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h> + +#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h new file mode 100644 index 000000000..026d30098 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019 Samsung Electronics 
Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ +#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ + +#include "arm_compute/core/TypesEx.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. + */ +class NEBinaryLogicalOperation : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8. + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * @param[in] op Binary Logical Operation to be performed. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op); +}; + +/** Basic function to run @ref NEBinaryLogicalOperationKernel + * + * @note The tensor data type for the inputs must be QASYMM8/U8. + * @note The function performs a binary logical operation between two tensors. 
+ */ +template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8 + * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + */ + void configure(ITensor *input1, ITensor *input2, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEBinaryLogicalOperationKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8 + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output); +}; + +/** Basic function to run element-wise logical AND. */ +using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +/** Basic function to run element-wise logical OR. */ +using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h new file mode 100644 index 000000000..c8b08af8d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECASTBOOL_H__ +#define __ARM_COMPUTE_NECASTBOOL_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to run @ref NECastBoolKernel. 
+ */ +class NECastBool : public INESimpleFunction +{ +public: + /** Initialize the function's source, destination + * + * Valid conversions Input -> Output : + * + * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16 + * + * @param[in] input The input tensor to convert. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NECastBool + * + * @param[in] input Source tensor info. Data types supported: U8. + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTBOOL_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h new file mode 100644 index 000000000..63f7714aa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file NEEmbeddingLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEEmbeddingLookup class + */ + +#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ +#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform EmbeddingLookup operation + */ +class NEEmbeddingLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. 
Data types supported: S32. + * @return N/A + */ + void configure(const ITensor *input, ITensor *output, const ITensor *lookups); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEEmbeddingLookup + * + * @param[in] input Source tensor info. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * @param[in] lookups Lookups tensor info. Data types supported: S32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups); +}; +} +#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h new file mode 100644 index 000000000..56548a479 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ + +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with NEON. 
This function calls + * the following kernels: + * + * -# @ref NETransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayerReshapeWeights + * + * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported: + * QASYMM8/F16/F32. + * @param[in] output Destination tensor info. Data type supported: Same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; + +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * and transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
+ */ +class NEFullyConnectedHybridLayer : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedHybridLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer(const NEFullyConnectedHybridLayer &) = delete; + /** Default move constructor */ + NEFullyConnectedHybridLayer(NEFullyConnectedHybridLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedHybridLayer &operator=(const NEFullyConnectedHybridLayer &) = delete; + /** Default move assignment operator */ + NEFullyConnectedHybridLayer &operator=(NEFullyConnectedHybridLayer &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedHybridLayer + * + * @param[in] input Source tensor info. Data type supported: F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: S8. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NEQuantizationSymmetricKernel _quant_input_kernel; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEMultiplyScaleFactorKernel _multiply_scale_kernel; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _reshape_weights_output; + Tensor _quantized_input; + Tensor _scale_factor; + Tensor _gemmlowp_output; + const ITensor *_original_weights; + bool _are_weights_reshaped; + bool _accumulate_biases; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h new file mode 100644 index 000000000..8f98f220a --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ +#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/NEON/functions/NEGEMM.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +/** Basic function to compute a Fully Connected layer on NEON. This function calls the following + * NEON kernels: + * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and + * transpose_weights is set to true ) (called once) + * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized + * asymmetric) + * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is + * not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + * @note The difference from NEFullyConnectedLayer is that this class supports weights as input + * with performance loss. 
+ */ +class NEFullyConnectedLayerEx : public IFunction +{ +public: + /** Constructor */ + NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; + /** Default move constructor */ + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; + /** Default move assignment operator */ + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix + * multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, + ITensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEFullyConnectedLayerEx + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) + * weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) + * weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor info. Its shape should be equal to the output of a + * matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the + * function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is + * called after another FullyConnected Layer. + * Data type supported: Same as @p input. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods override + void run() override; + void prepare() override; + +private: + void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + + MemoryGroup _memory_group; + NEFlattenLayerKernel _flatten_kernel; + NEConvertFullyConnectedWeights _convert_weights; + NEFullyConnectedLayerReshapeWeights _reshape_weights_function; + NEGEMM _mm_gemm; + NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; + NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + Tensor _flatten_output; + Tensor _gemmlowp_output; + Tensor _converted_weights_output; + Tensor _reshape_weights_output; + const ITensor *_original_weights; + bool _are_weights_converted; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _accumulate_biases; + bool _is_quantized; + bool _is_prepared; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h new file mode 100644 index 000000000..18cb61bf9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file       NEFullyConnectedReshapingLayer.h + * @brief      This file contains NEFullyConnectedReshapingLayer class + * @ingroup    COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ +#define __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ + +#include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> +#include <arm_compute/runtime/IMemoryManager.h> +#include <arm_compute/runtime/Tensor.h> + +namespace arm_compute +{ +/** + * @brief Class to run FullyConnected Layer after reshaping input tensor + */ +class NEFullyConnectedReshapingLayer : public arm_compute::IFunction +{ +public: + enum class KernelType + { + GENERAL, ///< General FC + PREPROCESSED_WEIGHTS ///< Weights are constants so it can be preprocessed + }; + +public: + NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] weights The tensor that is filled with weight values + * @param[in] biases The tensor that is filled with bias values + * @param[out] output The destination tensor + * @param[in] needs_reshape Whether it needs to be
reshaped or not + * @param[in] reshape The tensor shape to be reshaped. Only valid when needs_reshape is true. + * @param[in] kernel_type The kernel type for actual FullyConnected layer + * @return N/A + */ + void configure(const arm_compute::ITensor *input, const arm_compute::ITensor *weights, + const arm_compute::ITensor *biases, arm_compute::ITensor *output, + bool needs_reshape, const arm_compute::TensorShape &reshape, + KernelType kernel_type); + +public: + /** + * @brief Run the operation. Must be called after configure(). + * @return N/A + */ + void run(void) override; + /** + * @brief Prepare the operation + * @return N/A + */ + void prepare(void) override; + +private: + std::shared_ptr<IMemoryManager> _memory_manager; + const arm_compute::ITensor *_input; + const arm_compute::ITensor *_weights; + const arm_compute::ITensor *_biases; + arm_compute::ITensor *_output; + + // buffer for reshaping input tensor + arm_compute::Tensor _neon_buffer; + +private: + std::unique_ptr<arm_compute::IFunction> _neon_fc; + NEReshapeLayer _neon_reshape; + bool _needs_reshape; +}; +} // namespace arm_compute + +#endif // __ARM_COMPUTE_NE_FULLY_CONNECTED_RESHAPING_LAYER_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h new file mode 100644 index 000000000..155a1b837 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEGATHEREX_H__ +#define __ARM_COMPUTE_NEGATHEREX_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEGatherKernelEx */ +class NEGatherEx : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32. Each value must be in range [0, input.shape[@p axis]) + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + */ + void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGatherKernelEx + * + * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: + * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32. Each value must be in range [0, input.shape[@p axis]) + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] axis The axis in @p input to gather @p indices from. 
Required: this overload declares no default value + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis); +}; + +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h new file mode 100644 index 000000000..521a05ad9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * @file NEHashtableLookup.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::NEHashtableLookup class + */ + +#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ +#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +#include <vector> + +namespace arm_compute +{ +class ITensor; + +/** + * @brief Class to perform HashtableLookup operation + */ +class NEHashtableLookup : public INESimpleFunctionNoBorder +{ +public: + /** + * @brief Set the input and output tensors. + * @param[in] lookups Lookups 1D tensor that values are indices into the first dimension of + * input. Data types supported: S32 + * @param[in] keys Keys 1D tensor. keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p + * input. + * @param[out] hits Hits 1D tensor. A boolean tensor that indicates whether the lookup hits + * (True) or not (False). 
Data types supported: U8/QASYMM8 + * @return N/A + */ + void configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, + ITensor *hits); + /** Static function to check if given info will lead to a valid configuration of @ref NEHashtableLookup + * + * @param[in] lookups Lookups 1D tensor info. + * Data types supported: S32 + * @param[in] keys Keys 1D tensor info. 
keys and input pair represent a map. + * Data types supported: S32 + * @param[in] input Source tensor info. + * Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] hits Hits 1D tensor info. A boolean tensor that indicates whether the lookup + * hits (True) or not (False). Data types supported: U8/QASYMM8 + * + * @return a status + */ + static Status validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits); +}; +} +#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h new file mode 100644 index 000000000..18e813923 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ +#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ + +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEPermute.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform a Instance normalization. 
+ * + * This function runs the following kernels: + * -# @ref NEInstanceNormalizationLayerKernelEx + */ +class NEInstanceNormalizationLayerEx : public IFunction +{ +public: + /** Constructor */ + NEInstanceNormalizationLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will + * store the result of the normalization. + * Data types supported: F16/F32. Data layout supported: NHWC, NCHW + * @param[out] output Destination tensor. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma Scale tensor applied to the normalized tensor. + * No default in this signature; nullptr handling (1.0?) to be confirmed + * @param[in] beta Offset tensor applied to the normalized tensor. + * No default in this signature; nullptr handling (0.0?) to be confirmed + * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12 + */ + void configure(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon = 1e-12f); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEInstanceNormalizationLayerEx. + * + * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: + * NHWC, NCHW + * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p + * input. + * @param[in] gamma (Optional) Info of the scale tensor applied to the normalized tensor. Defaults + * to nullptr + * @param[in] beta (Optional) Info of the offset tensor applied to the normalized tensor. + * Defaults to nullptr + * @param[in] epsilon (Optional) Lower bound value for the normalization. 
Defaults to 1e-12 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma = nullptr, const ITensorInfo *beta = nullptr, + float epsilon = 1e-12f); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEInstanceNormalizationLayerKernelEx _normalization_kernel; + bool _is_nchw; + NEPermute _permute_input; + NEPermute _permute_output; + Tensor _permuted_input; + Tensor _permuted_output; +}; +} +#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h new file mode 100644 index 000000000..b2ea6270f --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEONEHOT_H__ +#define __ARM_COMPUTE_NEONEHOT_H__ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +namespace arm_compute +{ +// Forward declarations +class ITensor; +/** Basic function to run @ref NEOneHotKernel */ +class NEOneHot : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up + * to 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. 
Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + */ + void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEOneHotKernel + * + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. Supported tensor rank: + * up to 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[in] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * The value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis = -1); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEONEHOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h new file mode 100644 index 000000000..91eec815c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ +#define __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce operation */ +class NEReduceOperation : public IFunction +{ +public: + /** Constructor */ + NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. 
Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output, + ReductionOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * NEReduceOperation + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[in] output Destination tensor info. Data type supported: Same as @p input + * @param[in] op Reduce operation to perform. + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReductionOperation op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperation> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_OPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h new file mode 100644 index 000000000..48b416923 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NEON_REDUCE_SUM_H__ +#define __ARM_COMPUTE_NEON_REDUCE_SUM_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" +#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to perform reduce sum operation */ +class NEReduceSum : public IFunction +{ +public: + /** Constructor */ + NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Configure kernel + * + * @note Supported tensor rank: up to 4 + * + * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[out] output Destination tensor. Data type supported: Same as @p input + */ + void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReduceSum + * + * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32 + * @param[in] reduction_axis Reduction axis vector. + * @param[in] keep_dims If true, retains reduced dimensions with length 1. + * @param[in] output Destination tensor info. 
Data type supported: Same as @p input + * + * @return A status + */ + static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<NEReductionOperation> _reduction_kernels; + std::vector<Tensor> _reduced_outs; + NEReshapeLayer _reshape; + unsigned int _reduction_ops; + bool _keep_dims; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEON_REDUCE_SUM_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h new file mode 100644 index 000000000..24ff5dac9 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ +#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ + +#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReverse.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +/** Function to run the deconvolution layer. + * + * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input + * depending on the stride and pad info and then perform a 1x1 + * convolution pass. 
Input stride defines how many zeroes we should put between each element of the + * input, pad is the amount of padding and finally a is a user + * specified value where a < stride - 1 that increases the padding top and right of the input image. + * + * The relation between input to output is as follows: + * \f[ + * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x + * \f] + * \f[ + * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y + * \f] + * + * where + * width is the size of the first input dimension. + * height is the size of the second input dimension. + * width_output is the size of the first output dimension. + * height_output is the size of the second output dimension. + * kernel_x and kernel_y are the convolution sizes in x and y. + * stride_x and stride_y are the input strides of the first and second dimension. + * + * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. + * Therefore, it will be necessary to use the weights in the + * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse. 
+ * + * This function calls the following NEON kernels/functions: + * + * -# @ref CPPUpsample + * -# @ref NEConvolutionLayer + * -# @ref NEPermute + * -# @ref NEReverse + * + */ +class NETransposeConvLayer : public IFunction +{ +public: + /** Constructor */ + NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETransposeConvLayer(const NETransposeConvLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete; + /** Allow instances of this class to be moved */ + NETransposeConvLayer(NETransposeConvLayer &&) = default; + /** Allow instances of this class to be moved */ + NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default; + /** Default destructor */ + virtual ~NETransposeConvLayer() = default; + + /** Set the input, weights, biases and output tensors. + * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data types + * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 + * for F16 input. + * @param[out] output Output tensor. The output has the same number of dimensions as the @p + * input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * + */ + void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom); + /** Static function to check if given info will lead to a valid configuration of @ref + * NETransposeConvLayer + * + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an + * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type + * supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. Data types + * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. + * @param[in] output Output tensor info. The output has the same number of dimensions as the @p + * input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is + * described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom); + + // Inherited methods overridden: + void run() override; + void prepare() override; + +private: + MemoryGroup _memory_group; + NEConvolutionLayer _conv_f; + CPPUpsample _upsample_f; + NEReverse _flip_weights; + Tensor _scaled_output; + Tensor _weights_flipped; + Tensor _flip_axis; + const ITensor *_original_weights; + ITensor *_input; + PadStrideInfo _info; + bool _is_prepared; +}; +} // arm_compute +#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/resolve_includes.py b/compute/ARMComputeEx/resolve_includes.py new file mode 100755 index 000000000..f37c2a957 --- /dev/null +++ b/compute/ARMComputeEx/resolve_includes.py @@ -0,0 +1,116 @@ +# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) 2016, 2017 ARM Limited. 
+# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import collections
import os.path
import re
import subprocess
import glob


def _expand_includes(lines, sources, pattern, active):
    """Return `lines` with every `#include "name"` line replaced by that file's lines.

    Expansion is recursive, so includes found inside an included file are
    expanded too.

    :param lines: list of source lines (no trailing newlines) to expand.
    :param sources: dict mapping a file's base name to its FileEntry.
    :param pattern: compiled regex whose group(1) is the included file name.
    :param active: tuple of file names currently being expanded (outermost
                   first); used to detect circular includes.
    :raises KeyError: if an included file is not one of `sources`.
    :raises ValueError: if the include chain loops back onto itself.
    """
    expanded = []
    for line in lines:
        found = pattern.search(line)
        if not found:
            expanded.append(line)
            continue

        include_file = found.group(1)
        if include_file in active:
            # Expanding this again would never terminate.
            raise ValueError("Circular include detected: {}".format(
                " -> ".join(active + (include_file,))))
        if include_file not in sources:
            # Only files passed in `source` (i.e. the same folder) can be resolved.
            raise KeyError("'{}' is included by '{}' but is not among the source files".format(
                include_file, active[-1]))

        expanded.extend(
            _expand_includes(sources[include_file].file_contents, sources, pattern,
                             active + (include_file,)))
    return expanded


def resolve_includes(target, source):
    """Write each source file, with its includes inlined, as a C++ raw string literal.

    For every `source[i]` the file is read, each line matching
    `#include "<name>"` is replaced by the contents of the source file whose
    base name is `<name>`, and the result is written to `target[i]` wrapped
    between `R"(` and `)"`.

    :param target: list of output paths, parallel to `source`.
    :param source: list of input paths; includes can only refer to these files.
    :raises KeyError: if a file includes a name that is not in `source`.
    :raises ValueError: if the includes form a cycle.
    """
    # File collection
    FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')

    # Include pattern
    pattern = re.compile("#include \"(.*)\"")

    # Get file contents
    files = []
    for index, src in enumerate(source):
        dst = target[index]
        # `with` guarantees the handle is closed even if the read fails.
        with open(src) as in_file:
            contents = in_file.read().splitlines()
        files.append((os.path.basename(src), FileEntry(target_name=dst, file_contents=contents)))

    # Look-up table used to resolve includes by base name
    files_dict = dict(files)

    # Expand includes (can only be files in the same folder)
    final_files = []
    for name, entry in files:
        print(entry.target_name)
        expanded = _expand_includes(entry.file_contents, files_dict, pattern, (name,))

        # Append and prepend string literal identifiers and add expanded file to final list
        expanded.insert(0, "R\"(\n")
        expanded.append("\n)\"")
        final_files.append((name, FileEntry(target_name=entry.target_name,
                                            file_contents=expanded)))

    # Write output files
    for _, entry in final_files:
        with open(entry.target_name, 'w+') as out_file:
            out_file.write("\n".join(entry.file_contents))


# Generate embed files
cl_files = glob.glob('src/core/CL/cl_kernels/*.cl')
cl_files += glob.glob('src/core/CL/cl_kernels/*.h')

# DEBUG: print cl files
print("cl_files:")
print(cl_files)

embed_files = [f + "embed" for f in cl_files]
print("embed_files:")
print(embed_files)

resolve_includes(embed_files, cl_files)

# (diff dump) header and licence opening of the next file in the patch, kept verbatim:
# diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp new file mode 100644 index 000000000..81d0cb70f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <utility> +#include <vector> + +using namespace arm_compute; + +const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { + // ARMComputeEx kernels + {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast_bool", "cast.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"one_hot", "one_hot.cl"}, + {"one_hot_only_on_value", "one_hot.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, +}; + +const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { +#ifdef EMBEDDED_KERNELS + { + "arg_min_max_ex.cl", +#include 
"./cl_kernels/arg_min_max_ex.clembed" + }, + { + "cast.cl", +#include "./cl_kernels/cast.clembed" + }, + { + "embedding_lookup.cl", +#include "./cl_kernels/embedding_lookup.clembed" + }, + { + "gather_ex.cl", +#include "./cl_kernels/gather_ex.clembed" + }, + { + "gemmlowp_ex.cl", +#include "./cl_kernels/gemmlowp_ex.clembed" + }, + { + "hashtable_lookup.cl", +#include "./cl_kernels/hashtable_lookup.clembed" + }, + { + "helpers.h", +#include "./cl_kernels/helpers.hembed" + }, + { + "helpers_asymm.h", +#include "./cl_kernels/helpers_asymm.hembed" + }, + { + "instance_normalization_ex.cl", +#include "./cl_kernels/instance_normalization_ex.clembed" + }, + { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "multiply_scale_factor.cl", +#include "./cl_kernels/multiply_scale_factor.clembed" + }, + { + "neg_tensor.cl", +#include "./cl_kernels/neg_tensor.clembed" + }, + { + "one_hot.cl", +#include "./cl_kernels/one_hot.clembed" + }, + { + "quantization_symm8.cl", +#include "./cl_kernels/quantization_symm8.clembed" + }, + { + "reduce_operation.cl", +#include "./cl_kernels/reduce_operation.clembed" + }, + { + "scale_factor.cl", +#include "./cl_kernels/scale_factor.clembed" + }, + { + "topkv2.cl", +#include "./cl_kernels/topkv2.clembed" + }, + { + "topkv2_radixsort.cl", +#include "./cl_kernels/topkv2_radixsort.clembed" + }, + { + "topkv2_quicksort.cl", +#include "./cl_kernels/topkv2_quicksort.clembed" + }, + +#endif /* EMBEDDED_KERNELS */ +}; + +CLKernelLibraryEx::CLKernelLibraryEx() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() +{ + opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the + // CLKernelLibraryEx is built +} + +CLKernelLibraryEx &CLKernelLibraryEx::get() +{ + static CLKernelLibraryEx _kernel_library; + return _kernel_library; +} + +Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, + const StringSet &build_options_set) const +{ + // Find 
which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if (_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + std::string concat_str; + + if (fp16_supported()) + { + concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; + } + + if (get_cl_version(_device) == CLVersion::CL20) + { + concat_str += " -cl-std=CL2.0 "; + } + else if (arm_non_uniform_workgroup_supported(_device)) + { + concat_str += " -cl-arm-non-uniform-work-group-size "; + } + else + { + ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); + } + + // Check if the program has been built before with same build options. + const std::string program_name = kernel_program_it->second; + const std::string build_options = stringify_set(build_options_set) + concat_str; + + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + cl::Program cl_program; + + if (_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + cl_program = built_program_it->second; + } + else + { + // Get program + Program program = load_program(program_name); + + // Build program + cl_program = program.build(build_options); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, cl_program); + } + + // Create and return kernel + return Kernel(kernel_name, cl_program); +} + +void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, + cl::Program program) +{ + _built_programs_map.emplace(built_program_name, program); +} + +bool CLKernelLibraryEx::fp16_supported() const { return ::fp16_supported(_device); } + +bool CLKernelLibraryEx::int64_base_atomics_supported() const +{ + return device_supports_extension(_device, "cl_khr_int64_base_atomics"); +} + +const Program 
&CLKernelLibraryEx::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if (program_it != _programs_map.end()) + { + return program_it->second; + } + + Program program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if (_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); + } + + program = Program(_context, program_name, program_source_it->second); +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _kernel_path + program_name; + std::string binary_name = source_name + "bin"; + + if (std::ifstream(binary_name).is_open()) + { + const std::string program_binary = read_file(binary_name, true); + program = Program(_context, _device, program_name, + std::vector<unsigned char>(program_binary.begin(), program_binary.end())); + } + else if (std::ifstream(source_name).is_open()) + { + program = Program(_context, program_name, read_file(source_name, false)); + } + else + { + ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str()); + } +#endif /* EMBEDDED_KERNELS */ + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const +{ + std::string concat_set; + +#ifndef EMBEDDED_KERNELS + concat_set += "-I" + _kernel_path + " "; +#endif /* EMBEDDED_KERNELS */ + + // Concatenate set + for (const auto &el : s) + { + concat_set += " " + el; + } + + return concat_set; +} + +std::string CLKernelLibraryEx::get_program_source(const std::string &program_name) +{ + const auto program_source_it = _program_source_map.find(program_name); + + if (program_source_it == _program_source_map.end()) + { + ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", 
program_name.c_str()); + } + + return program_source_it->second; +} + +size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const +{ + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG( + err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); + + return result; +} + +cl::NDRange CLKernelLibraryEx::default_ndrange() const +{ + // GPUTarget _target = get_target_from_device(_device); + cl::Device device = cl::Device::getDefault(); + GPUTarget _target = get_target_from_device(device); + cl::NDRange default_range; + + switch (_target) + { + case GPUTarget::MIDGARD: + case GPUTarget::T600: + case GPUTarget::T700: + case GPUTarget::T800: + default_range = cl::NDRange(128u, 1); + break; + default: + default_range = cl::NullRange; + } + + return default_range; +} + +std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl new file mode 100644 index 000000000..0a014d15c --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(FLOAT_DATA_TYPE) +#define ISGREATER(x, y) isgreater(x, y) +#define ISLESS(x, y) isless(x, y) +#else // !FLOAT_DATA_TYPE +#if defined(WIDTH) +#define ISGREATER(x, y) (x > y) ? 1 : 0 +#define ISLESS(x, y) (x < y) ? 
1 : 0 +#else // !defined(WIDTH) +#define ISGREATER(x, y) \ + select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y) +#define ISLESS(x, y) \ + select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y) +#endif // defined(WIDTH) +#endif // defined(FLOAT_DATA_TYPE) + +#if defined(ARG_MAX) +#define CONDITION_TO_USE(x, y) ISGREATER(x, y) +#elif defined(ARG_MIN) +#define CONDITION_TO_USE(x, y) ISLESS(x, y) +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) +#error "Unsupported reduction operation!" +#endif // defined(ARG_MAX) + +#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) +#if defined(WIDTH) +#if defined(ARG_MIN) +#if defined(PREV_OUTPUT) +/** Find index minimum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. + */ +inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, + __global const DATA_TYPE_OUTPUT *prev_res, + const int x_idx) +{ + int end_elem = (x_idx + 1) * 16; + if (end_elem > WIDTH) + { + end_elem = WIDTH - x_idx * 16; + } + DATA_TYPE_OUTPUT res = prev_res[0]; + for (int x_v = 1; x_v < end_elem; ++x_v) + { + res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res)); + } + return res; +} +#else // !defined(PREV_OUTPUT) +/** Find index minimum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. 
+ */ +inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx) +{ +#if WIDTH < 16 + DATA_TYPE_OUTPUT res = 0; + for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) + { + res = select(res, x_v, *(input + x_v) < *(input + res)); + } + return res; +#else // WIDTH >= 16 + int x_elem = x_idx * 16; + const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); + x_elem -= x_goback; + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input - x_goback); + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) + idx_sel = (in.s01234567 <= in.s89abcdef); + in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); + res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); + + idx_sel.s0123 = (in.s0123 < in.s4567) || + (in.s0123 == in.s4567 && + CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); + res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); + + idx_sel.s01 = + (in.s01 < in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + in.s01 = select(in.s23, in.s01, idx_sel.s01); + res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); + + idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); + res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); + + return res.s0 + x_elem; +#endif // WIDTH < 16 +} +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MIN) +#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) +/** Find index maximum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. 
+ */ +inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, + __global const DATA_TYPE_OUTPUT *prev_res, + const int x_idx) +{ + int end_elem = (x_idx + 1) * 16; + if (end_elem > WIDTH) + { + end_elem = WIDTH - x_idx * 16; + } + DATA_TYPE_OUTPUT res = prev_res[0]; + unsigned int res_int = res; + DATA_TYPE_OUTPUT condition_check2; + for (int x_v = 1; x_v < end_elem; ++x_v) + { + int i1 = prev_res[x_v]; + condition_check2 = *(input + i1) > *(input + res_int); + res = select(res, prev_res[x_v], condition_check2); + } + return res; +} +#else // !defined(PREV_OUTPUT) +/** Find index maximum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. + */ +inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx) +{ +#if WIDTH < 16 + DATA_TYPE_OUTPUT res = 0; + unsigned int i1; + unsigned int i2; + DATA_TYPE_OUTPUT condition_check; + for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) + { + i1 = x_v; + i2 = res; + condition_check = *(input + i1) > *(input + i2); + res = select(res, x_v, condition_check); + } + return res; +#else // WIDTH >= 16 + int x_elem = x_idx * 16; + const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); + x_elem -= x_goback; + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input - x_goback); + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) + idx_sel = (in.s01234567 >= in.s89abcdef); + in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); + res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); + + idx_sel.s0123 = (in.s0123 > in.s4567) || + (in.s0123 == in.s4567 && + CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); + res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); + + idx_sel.s01 = + (in.s01 > in.s23) || + 
(in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + in.s01 = select(in.s23, in.s01, idx_sel.s01); + res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); + + idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); + res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); + + return res.s0 + x_elem; +#endif // WIDTH < 16 +} +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) + +/** This kernel performs parallel reduction given an operation on x-axis. + * + * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed + * using -DPREV_OUTPUT + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. + * -DDATA_TYPE_OUTPUT=uint + * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the + * ArgMax + * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the + * ArgMin + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * types: S32/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[in] prev_res_ptr (Optional) Pointer to previous results + * tensor. 
Supported data types: U32/S32 + * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X + * dimension (in bytes) + * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y + * dimension (in bytes) + * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element + * in the previous results tensor + * @param[in] partial_res_ptr The local buffer to hold partial result + * values. Supported data types: U32/S32 + * @param[in] partial_res_stride_x Stride of the output tensor in X dimension + * (in bytes) + * @param[in] partial_res_step_x partial_res_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension + * (in bytes) + * @param[in] partial_res_step_y partial_res_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[in] local_results Local buffer for storing the partial result + */ +__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), +#if defined(PREV_OUTPUT) + IMAGE_DECLARATION(prev_res), +#endif // defined(PREV_OUTPUT) + IMAGE_DECLARATION(partial_res), + __local DATA_TYPE_OUTPUT *local_results) +{ +#if defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); + Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res); +#else // !defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT(src); +#endif // defined(PREV_OUTPUT) + Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res); + + unsigned int lsize = get_local_size(0); + unsigned int lid = get_local_id(0); + + const uint x_idx = 
get_global_id(0); + const uint y_idx = get_global_id(1); + const __global DATA_TYPE *src_in_row = + (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + + y_idx * src_step_y); + + for (unsigned int y = 0; y < get_local_size(1); ++y) + { +#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) + local_results[lid] = arg_idx_max_prev_out( + src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#else // defined(ARG_MIN) +#if defined(PREV_OUTPUT) + local_results[lid] = arg_idx_min_prev_out( + src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) || defined(ARG_MIN) + + barrier(CLK_LOCAL_MEM_FENCE); + + // Looking for the next highest power of 2 (maximum value of lsize is 8) + unsigned int middle = lsize - 1; + middle |= middle >> 1; + middle |= middle >> 2; + middle += 1; + // Perform parallel reduction + DATA_TYPE_OUTPUT condition_check3; + for (unsigned int i = middle; i > 0; i >>= 1) + { + if (lid < i && lid + i < lsize) + { + DATA_TYPE tmp0 = *(src_in_row + local_results[lid]); + DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); +#if defined(ARG_MAX) + condition_check3 = + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); + local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3); +#else // defined(ARG_MIN) + local_results[lid] = select( + local_results[lid], local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); +#endif // defined(ARG_MAX) || defined(ARG_MIN) + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + ((__global DATA_TYPE_OUTPUT 
*)offset(&partial_res, get_group_id(0), y))[0] = local_results[0]; + } + } +} +#endif // defined(WIDTH) + +#if defined(HEIGHT) +/** This kernel performs reduction on y-axis. + * + * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. + * -DDATA_TYPE=float + * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. + * -DDATA_TYPE_OUTPUT=uint + * @note The data type of the select results must be passed at compile time using + * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int + * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * types: S32/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] output_ptr The local buffer to hold sumed values. 
Supported data types: U32/S32
+ * @param[in]  output_stride_x  Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x  output_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_y  Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y  output_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *              output tensor
+ */
+__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
+{
+  Image src = CONVERT_TO_IMAGE_STRUCT(src);
+  Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+  VEC_DATA_TYPE(DATA_TYPE, 16) // running best value per lane, seeded from row 0
+  res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) // row index at which the running best was found
+  indx = 0;
+  for (unsigned int y = 1; y < HEIGHT; ++y) // row 0 is already held in res/indx
+  {
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in =
+      CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+    indx = select(indx, y, cond_conv);                // selected lanes take the new index...
+    res = select(res, in, CONDITION_TO_USE(in, res)); // ...and the new value
+  }
+
+  // Store result
+  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * Each work-item loads 16 adjacent elements from every one of the DEPTH planes and stores, per
+ * lane, the index of the plane selected by CONDITION_TO_USE. That macro is defined outside this
+ * part of the file; presumably it is the less-than / greater-than test for arg-min / arg-max
+ * (TODO confirm).
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the stored indices must be passed at compile time using
+ *       -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The data type of the select results must be passed at compile time using
+ *       -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int (required by the enclosing #if guard,
+ *       not referenced by this kernel itself)
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in]  input_ptr  Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in]  input_stride_x  Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x  input_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  input_stride_y  Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y  input_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  input_stride_z  Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z  input_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the
+ *              source tensor
+ * @param[out] output_ptr  Pointer to the destination tensor that receives the selected
+ *              indices. Supported data types: U32/S32
+ * @param[in]  output_stride_x  Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x  output_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_y  Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y  output_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_z  Stride of the output tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z  output_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *              output tensor
+ */
+__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
+{
+  Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+  VEC_DATA_TYPE(DATA_TYPE, 16) // running best value per lane, seeded from plane 0
+  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
+                VEC_DATA_TYPE(DATA_TYPE, 16));
+
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) // plane index at which the running best was found
+  indx = 0;
+  for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z) // plane 0 is already held in res/indx
+  {
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
+                 VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+    indx = select(indx, z, cond_conv);                // selected lanes take the new index...
+    res = select(res, in, CONDITION_TO_USE(in, res)); // ...and the new value
+  }
+
+  // Store result
+  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * Each work-item loads 16 adjacent elements from every one of the BATCH slices along W and
+ * stores, per lane, the index of the slice selected by CONDITION_TO_USE. That macro is defined
+ * outside this part of the file; presumably it is the less-than / greater-than test for
+ * arg-min / arg-max (TODO confirm).
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the stored indices must be passed at compile time using
+ *       -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The data type of the select results must be passed at compile time using
+ *       -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int (required by the enclosing #if guard,
+ *       not referenced by this kernel itself)
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128; it is
+ *       forwarded to CONVERT_TO_TENSOR4D_STRUCT for both tensors
+ *
+ * @param[in]  input_ptr  Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in]  input_stride_x  Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x  input_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  input_stride_y  Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y  input_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  input_stride_z  Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z  input_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  input_stride_w  Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w  input_stride_w * number of elements along W processed per
+ *              workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the
+ *              source tensor
+ * @param[out] output_ptr  Pointer to the destination tensor that receives the selected
+ *              indices. Supported data types: U32/S32
+ * @param[in]  output_stride_x  Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x  output_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_y  Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y  output_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_z  Stride of the output tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z  output_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_w  Stride of the output tensor in W dimension (in bytes)
+ * @param[in]  output_step_w  output_stride_w * number of elements along W processed per
+ *              workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *              output tensor
+ */
+__kernel void
arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
+{
+  Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+  VEC_DATA_TYPE(DATA_TYPE, 16) // running best value per lane, seeded from slice 0
+  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)),
+                VEC_DATA_TYPE(DATA_TYPE, 16));
+
+  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) // slice index at which the running best was found
+  indx = 0;
+  for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w) // slice 0 is already held in res/indx
+  {
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)),
+                 VEC_DATA_TYPE(DATA_TYPE, 16));
+
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
+    indx = select(indx, w, cond_conv);                // selected lanes take the new index...
+    res = select(res, in, CONDITION_TO_USE(in, res)); // ...and the new value
+  }
+
+  // Store result
+  vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
new file mode 100644
index 000000000..e249663bc
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+// Default to scalar processing when the host does not pass -DVEC_SIZE.
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(OP_CODE) && defined(DATA_TYPE)
+/** Returns the truth value of the two input tensors for a BINARY LOGICAL OP,
+ * where BINARY LOGICAL OP can be AND, OR.
+ *
+ * Each work-item loads VEC_SIZE elements from both inputs, combines them with && or || and
+ * stores the result converted back to DATA_TYPE.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size.
+ *            e.g. -DVEC_SIZE=16. It defaults to 1 when not given.
+ * @attention Operation type(code) specifying which operation to perform should be passed as
+ *            preprocessor argument using -DOP_CODE=number. e.g. -DOP_CODE=1
+ *            1 selects LOGICAL AND, 2 selects LOGICAL OR. With any other value the kernel
+ *            returns without writing to the output.
+ *
+ * @param[in]  input1_ptr  Pointer to the first source tensor. Supported data types: QASYMM8
+ * @param[in]  input1_stride_x  Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input1_step_x  input1_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  input1_stride_y  Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input1_step_y  input1_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  input1_stride_z  Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z  input1_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the
+ *              source tensor
+ * @param[in]  input2_ptr  Pointer to the second source tensor. Supported data types: QASYMM8
+ * @param[in]  input2_stride_x  Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input2_step_x  input2_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  input2_stride_y  Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input2_step_y  input2_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  input2_stride_z  Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input2_step_z  input2_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the
+ *              source tensor
+ * @param[out] output_ptr  Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in]  output_stride_x  Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x  output_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_y  Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y  output_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_z  Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z  output_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *              destination tensor
+ */
+__kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATION(input2),
+                                TENSOR3D_DECLARATION(output))
+{
+  Tensor3D input1 = CONVERT_TO_TENSOR3D_STRUCT(input1);
+  Tensor3D input2 = CONVERT_TO_TENSOR3D_STRUCT(input2);
+  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if OP_CODE == 1 // LOGICAL AND
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) &&
+             VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#elif OP_CODE == 2 // LOGICAL OR
+  VSTORE(VEC_SIZE)
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) ||
+             VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr),
+           VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+   0, (__global DATA_TYPE *)output.ptr);
+
+#else // OP NOT SUPPORTED
+  // Unsupported OP_CODE: leave the output untouched. The statement needs its terminating
+  // semicolon, otherwise this branch is a syntax error and the program fails to build.
+  return;
+
+#endif
+}
+#endif // if defined(OP_CODE) && defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
new file mode 100644
index 000000000..3b0a175a4
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd.
All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs an up-scaling depth conversion for boolean type input.
+ *
+ * Each work-item loads VEC_SIZE input elements, keeps only bit 0 of each element (in_data & 1)
+ * and stores the result converted to DATA_TYPE_OUT.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and
+ *       -DDATA_TYPE_OUT:
+ *       e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *       -DVEC_SIZE=16
+ * @note -DSHIFT (e.g. -DSHIFT=7) is not referenced by this kernel.
+ *
+ * @param[in]  in_ptr  Pointer to the source image. Supported data types: U8
+ * @param[in]  in_stride_x  Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x  in_stride_x * number of elements along X processed per workitem
+ *              (in bytes)
+ * @param[in]  in_stride_y  Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y  in_stride_y * number of elements along Y processed per workitem
+ *              (in bytes)
+ * @param[in]  in_stride_z  Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z  in_stride_z * number of elements along Z processed per workitem
+ *              (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source
+ *              image
+ * @param[out] out_ptr  Pointer to the destination image. Supported data types:
+ *              U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x  Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x  out_stride_x * number of elements along X processed per workitem
+ *              (in bytes)
+ * @param[in]  out_stride_y  Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y  out_stride_y * number of elements along Y processed per workitem
+ *              (in bytes)
+ * @param[in]  out_stride_z  Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z  out_stride_z * number of elements along Z processed per workitem
+ *              (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes  The offset of the first element in the
+ *              destination image
+ */
+__kernel void cast_bool(TENSOR3D_DECLARATION(in), TENSOR3D_DECLARATION(out))
+{
+  // Get pixels pointer
+  Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+  Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+  // Load data
+  VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+  in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr);
+
+  VSTORE(VEC_SIZE) // mask to bit 0 so every stored element is 0 or 1 before conversion
+  (CONVERT(in_data & 1, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
+   (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
new file mode 100644
index 000000000..92e5dfbee
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#ifndef VEC_SIZE
+#define VEC_SIZE 1
+#endif
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
+/** Perform embedding_lookup of input tensor
+ *
+ * For every output position the coordinate of dimension NUM_DIMS (1 = X ... 4 = W) is replaced
+ * by the value read from the lookups vector; the remaining coordinates are taken from the
+ * global ids. VEC_SIZE elements are then copied from that input position to the output.
+ *
+ * The source address is computed from the input_step_* kernel arguments, and
+ * input_offset_first_element_in_bytes is added explicitly.
+ * NOTE(review): if CONVERT_TO_TENSOR4D_STRUCT_NO_STEP (defined in helpers.h, not visible here)
+ * already folds the first-element offset into the struct pointer, that offset is counted twice
+ * whenever it is non-zero -- confirm against helpers.h.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
+ *            -DVEC_SIZE=16. It defaults to 1 when not given.
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16
+ * @attention Number of input dimensions are passed as a preprocessor argument using
+ *            -DNUM_DIMS=size, e.g. -DNUM_DIMS=4
+ *
+ * @param[in]  input_ptr  Pointer to the source tensor. Supported data types:
+ *              U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x  Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x  input_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  input_stride_y  Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y  input_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  input_stride_z  Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z  input_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the
+ *              source tensor
+ * @param[in]  input_stride_w  Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w  input_stride_w * number of elements along W processed per
+ *              workitem (in bytes)
+ * @param[out] output_ptr  Pointer to the destination tensor. Supported data types: same as
+ *              @p input_ptr
+ * @param[in]  output_stride_x  Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x  output_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_y  Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y  output_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_z  Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z  output_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  output_stride_w  Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w  output_stride_w * number of elements along W processed per
+ *              workitem (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the
+ *              destination tensor
+ * @param[in]  lookups_ptr  Pointer to the lookups vector. Supported data types: S32
+ * @param[in]  lookups_stride_x  Stride of the lookups vector in X dimension (in bytes)
+ * @param[in]  lookups_step_x  lookups_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  lookups_offset_first_element_in_bytes  The offset of the first element in the
+ *              lookups vector
+ */
+
+__kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
+                               VECTOR_DECLARATION(lookups))
+{
+  Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+  Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT);
+
+  Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups);
+
+  // lookup ids based on the tensor dimensions
+  int lup_id[4] = {0};
+
+  lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0)))
+                              : get_global_id(0);
+  lup_id[1] = (NUM_DIMS == 2) ?
*((__global int *)vector_offset(&lups, get_global_id(1)))
+                              : get_global_id(1);
+  lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2)))
+                              : get_global_id(2) % DEPTH_OUT; // global id 2 packs Z and W
+  lup_id[3] = (NUM_DIMS == 4)
+                ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT))
+                : get_global_id(2) / DEPTH_OUT;
+
+  in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x +
+            lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w;
+
+  VSTORE(VEC_SIZE) // plain copy: the value is converted to the type it already has
+  (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0,
+   (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
new file mode 100644
index 000000000..2236021f1
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
+
+/** Performs the Gather operation along the chosen axis
+ *
+ * Each work-item copies one element. The output coordinates that correspond to the indices
+ * tensor are used to read an index, which replaces the input coordinate on AXIS; the remaining
+ * output coordinates are shifted onto the input dimensions that follow AXIS.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ *       -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @note The number of dimensions of the indices tensor should be given as a preprocessor
+ *       argument using -DINDICES_DIM=dims. e.g. -DINDICES_DIM=1. Handled combinations are
+ *       AXIS 0 and 1 with INDICES_DIM 1..3, AXIS 2 with INDICES_DIM 1..2 and AXIS 3 with
+ *       INDICES_DIM 1; for any other combination no source address is declared and the
+ *       kernel does not build.
+ * @note The index is read through a `uint` pointer.
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ *            -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ *            -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ * @param[in]  input_ptr  Pointer to the source tensor. Supported data types:
+ *              U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  input_stride_x  Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x  input_stride_x * number of elements along X processed per
+ *              work item (in bytes)
+ * @param[in]  input_stride_y  Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y  input_stride_y * number of elements along Y processed per
+ *              work item (in bytes)
+ * @param[in]  input_stride_z  Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z  input_stride_z * number of elements along Z processed per
+ *              work item (in bytes)
+ * @param[in]  input_stride_w  Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w  input_stride_w * number of elements along W processed per
+ *              work item (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  Offset of the first element in the source
+ *              tensor
+ * @param[in]  indices_ptr  Pointer to the indices tensor. Supported data types: S32
+ * @param[in]  indices_stride_x  Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x  indices_stride_x * number of elements along X processed per
+ *              workitem (in bytes)
+ * @param[in]  indices_stride_y  Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y  indices_stride_y * number of elements along Y processed per
+ *              workitem (in bytes)
+ * @param[in]  indices_stride_z  Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z  indices_stride_z * number of elements along Z processed per
+ *              workitem (in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes  The offset of the first element in the
+ *              indices tensor
+ * @param[out] output_ptr  Pointer to the destination tensor. Supported data types: same as
+ *              @p input_ptr
+ * @param[in]  output_stride_x  Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x  output_stride_x * number of elements along X processed per
+ *              work item (in bytes)
+ * @param[in]  output_stride_y  Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y  output_stride_y * number of elements along Y processed per
+ *              work item (in bytes)
+ * @param[in]  output_stride_z  Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z  output_stride_z * number of elements along Z processed per
+ *              work item (in bytes)
+ * @param[in]  output_stride_w  Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w  output_stride_w * number of elements along W processed per
+ *              work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the
+ *              destination tensor
+ */
+__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
+                        TENSOR4D_DECLARATION(output))
+{
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2) % OUTPUT_DIM_Z; // global id 2 packs Z and W of the output
+  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+  const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+#elif INDICES_DIM == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py,
pz);
+  __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+#endif
+#elif AXIS == 1
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+#elif INDICES_DIM == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+#endif
+#elif AXIS == 2
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif INDICES_DIM == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+#endif
+#elif AXIS == 3
+#if INDICES_DIM == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
+  __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif
+#endif // AXIS
+
+  *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
new file mode 100644
index 000000000..80ba73d1d
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "helpers.h"
+
+#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \
+  defined(COLS_A)
+#define VECTOR_CHAR VEC_DATA_TYPE(char, NUM_ELEMS_PROCESSED_PER_THREAD_X) // char vector, X lanes
+#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) // int vector, X lanes
+#define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X) // float vector, X lanes
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B
+ *  (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @attention The number of elements processed per thread along X and Y needs to be passed at
+ *            compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and
+ *            -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ *       information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ *          tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr  Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in]  src0_stride_x  Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x  src0_stride_x * number of elements along X processed per workitem
+ *              (in bytes)
+ * @param[in]  src0_stride_y  Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y  src0_stride_y * number of elements along Y processed per workitem
+ *              (in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes  The offset of the first element in the source
+ *              matrix
+ * @param[in]  src1_ptr  Pointer to the source matrix.
Supported data type: + * same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), + IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z, + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if 
defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx; + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + int end_row_vec_a = src_addr.s0 + COLS_A; + + VECTOR_INT acc0 = 0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VECTOR_INT acc1 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VECTOR_INT acc2 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VECTOR_INT acc3 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + VECTOR_INT acc4 = 0; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + + for (; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y)) + { + // Load values from matrix A + char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + char2 a4 = vload2(0, (__global 
char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + // Load values from matrix B + VECTOR_CHAR b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( + 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0; + acc0 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a0.s1; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1.s0; + acc1 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a1.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2.s0; + acc2 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a2.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3.s0; + acc3 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a3.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4.s0; + acc4 += CONVERT(b1, VECTOR_INT) * (VECTOR_INT)a4.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y)) + { + // Load values from matrix A + char a0 = *(__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + char a1 = *(__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + char a2 = *(__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + char a3 = *(__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + char a4 = *(__global char *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + // Load values from matrix B + VECTOR_CHAR b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + + // Accumulate + acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + acc4 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a4; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + } + + const int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint8)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst.ptr += z * dst_stride_z * DEPTH_GEMM3D; + + // Store the result + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst.ptr += z * dst_stride_z; + + // Store the result + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc3, 
VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 + VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X) + (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 +#endif // defined(REINTERPRET_OUTPUT_AS_3D) +} +#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && + // defined(COLS_A) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl new file mode 100644 index 000000000..a4f7dbd48 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) +/** Perform hashtable_lookup of input tensor + * + * The index along input dimension NUM_DIMS - 1 is read from @p lookups, the remaining indices come + * from the work-item ids. A negative lookup value makes the work-item store zeros to the output + * instead of copying from the input. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DDEPTH_OUT=depth. e.g. -DDEPTH_OUT=16 + * @attention Number of input dimensions are passed as a preprocessor argument using + * -DNUM_DIMS=size, e.g. -DNUM_DIMS=4 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w input_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension + * (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] lookups_ptr Pointer to the lookups vector. Supported data + * types: S32 + * @param[in] lookups_stride_x Stride of the lookups vector in X dimension (in + * bytes) + * @param[in] lookups_step_x lookups_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lookups_offset_first_element_in_bytes The offset of the first element in the lookups + * vector + */ +__kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), + VECTOR_DECLARATION(lookups)) +{ + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, DEPTH_OUT); + + Vector lups = CONVERT_TO_VECTOR_STRUCT_NO_STEP(lookups); + + int lup_id[4] = {0}; + + lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) + : get_global_id(0); + lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) + : get_global_id(1); + lup_id[2] = (NUM_DIMS == 3) ? 
*((__global int *)vector_offset(&lups, get_global_id(2))) + : get_global_id(2) % DEPTH_OUT; + lup_id[3] = (NUM_DIMS == 4) + ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; + + if (lup_id[NUM_DIMS - 1] < 0) + { + VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 0, (__global DATA_TYPE *)out.ptr); + return; + } + /* NOTE(review): in.ptr comes from CONVERT_TO_TENSOR4D_STRUCT_NO_STEP; update_tensor4D_workitem_ptr is not shown here, but if it adds offset_first_element_in_bytes to ptr the way update_vector_workitem_ptr and update_image_workitem_ptr in helpers.h do, the offset below is applied twice - confirm. */ + in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; + + VSTORE(VEC_SIZE) + (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, + (__global DATA_TYPE *)out.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(NUM_DIMS) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h new file mode 100644 index 000000000..e07a25ec9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_HELPER_H +#define ARM_COMPUTE_HELPER_H + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) +#pragma OPENCL EXTENSION cl_arm_printf : enable +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) + +#define GPU_ARCH_MIDGARD 0x100 +#define GPU_ARCH_BIFROST 0x200 + +/** Concatenate two inputs. + * + * @param[in] a The first input to be concatenated + * @param[in] b The second input to be concatenated + * + * @return The concatenated output + */ +#define CONCAT(a, b) a##b + +/** Expand to the given argument unchanged. + * + * @param[in] x The argument to be expanded + * + * @return The argument @p x itself + */ +#define EXPAND(x) x + +/** Clamp the given value between an upper and lower bound. + * + * @param[in] x The value to be clamped + * @param[in] min_val The lower bound + * @param[in] max_val The upper bound + * + * @return The clamped value. + */ +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + +/** REVn reverses the given vector whose size is n. 
+ * @name REVn + * + * @param[in] x The vector to be reversed + * + * @return The reversed vector + * @{ + */ +#define REV1(x) ((x)) +#define REV2(x) ((x).s10) +#define REV3(x) ((x).s210) +#define REV4(x) ((x).s3210) +#define REV8(x) ((x).s76543210) +#define REV16(x) ((x).sFEDCBA9876543210) +/** @} */ // end of group REVn + +/** Reverse the given vector. + * @name REVERSE + * + * @param[in] x The vector to be reversed + * @param[in] s The size of the vector + * + * @return The reversed vector + * @{ + */ +#define REVERSE_STR(x, s) REV##s((x)) +#define REVERSE(x, s) REVERSE_STR(x, s) +/** @} */ // end of group REVERSE + +/** Circular-right-shift (rotate-right) the vector of size s by the amount of n. + * @name ROTs_n + * + * @param[in] x The vector to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROT1_0(x) ((x)) + +#define ROT2_0(x) ((x)) +#define ROT2_1(x) ((x).s10) + +#define ROT3_0(x) ((x)) +#define ROT3_1(x) ((x).s201) +#define ROT3_2(x) ((x).s120) + +#define ROT4_0(x) ((x)) +#define ROT4_1(x) ((x).s3012) +#define ROT4_2(x) ((x).s2301) +#define ROT4_3(x) ((x).s1230) + +#define ROT8_0(x) ((x)) +#define ROT8_1(x) ((x).s70123456) +#define ROT8_2(x) ((x).s67012345) +#define ROT8_3(x) ((x).s56701234) +#define ROT8_4(x) ((x).s45670123) +#define ROT8_5(x) ((x).s34567012) +#define ROT8_6(x) ((x).s23456701) +#define ROT8_7(x) ((x).s12345670) + +#define ROT16_0(x) ((x)) +#define ROT16_1(x) ((x).sF0123456789ABCDE) +#define ROT16_2(x) ((x).sEF0123456789ABCD) +#define ROT16_3(x) ((x).sDEF0123456789ABC) +#define ROT16_4(x) ((x).sCDEF0123456789AB) +#define ROT16_5(x) ((x).sBCDEF0123456789A) +#define ROT16_6(x) ((x).sABCDEF0123456789) +#define ROT16_7(x) ((x).s9ABCDEF012345678) +#define ROT16_8(x) ((x).s89ABCDEF01234567) +#define ROT16_9(x) ((x).s789ABCDEF0123456) +#define ROT16_10(x) ((x).s6789ABCDEF012345) +#define ROT16_11(x) ((x).s56789ABCDEF01234) +#define ROT16_12(x) ((x).s456789ABCDEF0123) +#define ROT16_13(x) ((x).s3456789ABCDEF012) +#define 
ROT16_14(x) ((x).s23456789ABCDEF01) +#define ROT16_15(x) ((x).s123456789ABCDEF0) +/** @} */ // end of group ROTs_n + +/** Circular-right-shift (rotate-right) the given vector by the given amount. + * @name ROTATE + * + * @param[in] x The vector to be shifted + * @param[in] s The size of the vector + * @param[in] n The amount to be shifted + * + * @return The shifted vector + * @{ + */ +#define ROTATE_STR(x, s, n) ROT##s##_##n(x) +#define ROTATE(x, s, n) ROTATE_STR(x, s, n) +/** @} */ // end of group ROTATE + +/** Creates a vector of size n filled with offset values corresponding to the location of each + * element, i.e. element i holds the value i. + * @name V_OFFSn + * + * @param[in] dt The data type of the output vector (used directly as the type of the vector + * literal, so it must already be the n-element vector type) + * + * @return The vector filled with offset values + * @{ + */ +#define V_OFFS1(dt) (dt)(0) +#define V_OFFS2(dt) (dt)(0, 1) +#define V_OFFS3(dt) (dt)(0, 1, 2) +#define V_OFFS4(dt) (dt)(0, 1, 2, 3) +#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7) +#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +/** @} */ // end of group V_OFFSn + +/** Create a vector filled with offset values corresponding to the location of each element. 
+ * @name VEC_OFFS + * + * @param[in] dt The data type of the output vector + * @param[in] s The size of the output vector + * + * @return The vector filled with offset values + * @{ + */ +#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) +#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) +/** @} */ // end of group VEC_OFFS + +#define VLOAD_STR(size) vload##size +#define VLOAD(size) VLOAD_STR(size) + +#define VSTORE_STR(size) vstore##size +#define VSTORE(size) VSTORE_STR(size) + +#define float1 float +#define half1 half +#define char1 char +#define uchar1 uchar +#define short1 short +#define ushort1 ushort +#define int1 int +#define uint1 uint +#define long1 long +#define ulong1 ulong +#define double1 double + +/* Scalar stand-ins for vloadn/vstoren so that VLOAD(1)/VSTORE(1) expand to something valid. + * The arguments and the whole expansion are parenthesized so that compound expressions passed as + * arguments, or an expression surrounding the load, keep their intended meaning. */ +#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR))) +#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA)) + +// Convert built-in functions with _sat modifier are not supported in floating point so we create +// defines without _sat to overcome this issue: every floating point convert_<type>_sat name maps to +// the plain convert_<type> of the same type. +#define convert_float_sat convert_float +#define convert_float1_sat convert_float +#define convert_float2_sat convert_float2 +#define convert_float3_sat convert_float3 +#define convert_float4_sat convert_float4 +#define convert_float8_sat convert_float8 +#define convert_float16_sat convert_float16 +#define convert_half_sat convert_half +#define convert_half1_sat convert_half +#define convert_half2_sat convert_half2 +#define convert_half3_sat convert_half3 +#define convert_half4_sat convert_half4 +#define convert_half8_sat convert_half8 +#define convert_half16_sat convert_half16 + +#define convert_float1 convert_float +#define convert_half1 convert_half +#define convert_char1 convert_char +#define convert_uchar1 convert_uchar +#define convert_short1 convert_short +#define convert_ushort1 convert_ushort +#define convert_int1 convert_int +#define convert_uint1 convert_uint +#define convert_long1 convert_long +#define convert_ulong1 convert_ulong +#define convert_double1 convert_double + +#define convert_char1_sat 
convert_char_sat +#define convert_uchar1_sat convert_uchar_sat +#define convert_short1_sat convert_short_sat +#define convert_ushort1_sat convert_ushort_sat +#define convert_int1_sat convert_int_sat +#define convert_uint1_sat convert_uint_sat +#define convert_long1_sat convert_long_sat +#define convert_ulong1_sat convert_ulong_sat +/* double is a floating point type: there is no saturating conversion to it, so the size-1 alias maps to the plain conversion (same treatment as convert_float1_sat). */ +#define convert_double1_sat convert_double + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CL_VEC_DATA_TYPE_STR(type, size) type##size +#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) + +#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ + uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_VECTOR_STRUCT(name) \ + update_vector_workitem_ptr(name##_ptr, 
name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x) + +/* The CONVERT_TO_..._STRUCT macros build the struct for the current work-item from the kernel + * arguments declared with the matching ..._DECLARATION(name) macro. The _NO_STEP variants pass 0 for + * the step arguments (CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP still forwards the Z step). */ +#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ + update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, name##_step_x, name##_stride_y, \ + name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0) + +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z, 
name##_stride_w, name##_step_w, mod_size) + +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \ + mod_size) + +/** Structure to hold Vector information */ +typedef struct Vector +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ +} Vector; + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Structure to hold 3D tensor information */ +typedef struct Tensor3D +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ + int stride_z; /**< Stride of the image in Z dimension (in bytes) */ +} Tensor3D; + +/** Structure to hold 4D tensor information */ +typedef struct Tensor4D +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ + int stride_z; /**< Stride of the image in Z dimension (in bytes) */ + int stride_w; /**< Stride of the image in W 
dimension (in bytes) */
+} Tensor4D;
+
+/** Build a Vector descriptor from raw kernel arguments and move its pointer to the data handled
+ * by the calling work-item.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem (in bytes)
+ *
+ * @return A Vector object
+ */
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+                                         uint stride_x, uint step_x)
+{
+  Vector vec;
+  vec.ptr = ptr;
+  vec.offset_first_element_in_bytes = offset_first_element_in_bytes;
+  vec.stride_x = stride_x;
+
+  /* First element of the buffer, then this work-item's share along X. */
+  vec.ptr += vec.offset_first_element_in_bytes + get_global_id(0) * step_x;
+  return vec;
+}
+
+/** Build an Image descriptor from raw kernel arguments and move its pointer to the data handled
+ * by the calling work-item.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem (in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem (in bytes)
+ *
+ * @return An Image object
+ */
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
+                                       uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+  Image image;
+  image.ptr = ptr;
+  image.offset_first_element_in_bytes = offset_first_element_in_bytes;
+  image.stride_x = stride_x;
+  image.stride_y = stride_y;
+
+  /* First element of the buffer, then this work-item's share along X and Y. */
+  image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x +
+               get_global_id(1) * step_y;
+  return image;
+}
+
+/** Wrap 3D tensor information into an
image structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * Only the X and Y strides are stored in the returned Image; the Z stride is accepted but not
+ * kept, while the Z step still contributes to the pointer offset.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem (in bytes)
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem (in bytes)
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per
+ *                                          workitem (in bytes)
+ *
+ * @return An Image object
+ */
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+                                                     uint offset_first_element_in_bytes,
+                                                     uint stride_x, uint step_x, uint stride_y,
+                                                     uint step_y, uint stride_z, uint step_z)
+{
+  Image image;
+  image.ptr = ptr;
+  image.offset_first_element_in_bytes = offset_first_element_in_bytes;
+  image.stride_x = stride_x;
+  image.stride_y = stride_y;
+
+  /* First element of the buffer, then this work-item's share along X, Y and Z. */
+  image.ptr += image.offset_first_element_in_bytes + get_global_id(0) * step_x +
+               get_global_id(1) * step_y + get_global_id(2) * step_z;
+  return image;
+}
+
+/** Wrap 3D tensor information into a Tensor3D structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per
+ *                                          workitem (in bytes)
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per
+ *                                          workitem (in bytes)
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per
+ *                                          workitem (in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+                                             uint offset_first_element_in_bytes, uint stride_x,
+                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
+                                             uint step_z)
+{
+  Tensor3D t3d;
+  t3d.ptr = ptr;
+  t3d.offset_first_element_in_bytes = offset_first_element_in_bytes;
+  t3d.stride_x = stride_x;
+  t3d.stride_y = stride_y;
+  t3d.stride_z = stride_z;
+
+  /* First element of the buffer, then this work-item's share along X, Y and Z. */
+  t3d.ptr += t3d.offset_first_element_in_bytes + get_global_id(0) * step_x +
+             get_global_id(1) * step_y + get_global_id(2) * step_z;
+  return t3d;
+}
+
+/** Wrap 4D tensor information into a Tensor4D structure, and make the pointer point at this
+ * workitem's data.
+ *
+ * The third work-item id is shared by two dimensions: (id % mod_size) is multiplied by step_z and
+ * (id / mod_size) by step_w.
+ *
+ * NOTE(review): the instance normalization kernel in this change calls the NO_STEP wrapper with
+ * mod_size = 0 and all steps 0, so the results of % 0 and / 0 are multiplied by zero. This
+ * presumably relies on OpenCL not trapping on integer division by zero - confirm.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        Byte step along X per work-item
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        Byte step along Y per work-item
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        Byte step along Z per work-item
+ * @param[in] stride_w                      Stride of the tensor in W dimension (in bytes)
+ * @param[in] step_w                        Byte step along W per work-item
+ * @param[in] mod_size                      Divisor used to split the third work-item id into its
+ *                                          Z and W parts
+ *
+ * @return A 4D tensor object
+ */
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+                                             uint offset_first_element_in_bytes, uint stride_x,
+                                             uint step_x, uint stride_y, uint step_y, uint stride_z,
+                                             uint step_z, uint stride_w, uint step_w, uint mod_size)
+{
+  Tensor4D t4d;
+  t4d.ptr = ptr;
+  t4d.offset_first_element_in_bytes = offset_first_element_in_bytes;
+  t4d.stride_x = stride_x;
+  t4d.stride_y = stride_y;
+  t4d.stride_z = stride_z;
+  t4d.stride_w = stride_w;
+
+  t4d.ptr += t4d.offset_first_element_in_bytes + get_global_id(0) * step_x +
+             get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
+             (get_global_id(2) / mod_size) * step_w;
+  return t4d;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the
buffer
+ * @param[in] x   Relative X position
+ *
+ * @return vec->ptr advanced by x strides along X
+ */
+inline __global const uchar *vector_offset(const Vector *vec, int x)
+{
+  __global const uchar *pos = vec->ptr;
+  pos += x * vec->stride_x;
+  return pos;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Image descriptor whose ptr is the reference position
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ *
+ * @return img->ptr advanced by x strides along X and y strides along Y
+ */
+inline __global uchar *offset(const Image *img, int x, int y)
+{
+  __global uchar *pos = img->ptr;
+  pos += x * img->stride_x;
+  pos += y * img->stride_y;
+  return pos;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Tensor3D descriptor whose ptr is the reference position
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ *
+ * @return tensor->ptr advanced by the given number of strides along X, Y and Z
+ */
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+  __global const uchar *pos = tensor->ptr;
+  pos += x * tensor->stride_x;
+  pos += y * tensor->stride_y;
+  pos += z * tensor->stride_z;
+  return pos;
+}
+
+/** Get the pointer position of a Tensor4D
+ *
+ * @param[in] tensor Tensor4D descriptor whose ptr is the reference position
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ * @param[in] w      Relative W position
+ *
+ * @return tensor->ptr advanced by the given number of strides along X, Y, Z and W
+ */
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+{
+  __global const uchar *pos = tensor->ptr;
+  pos += x * tensor->stride_x;
+  pos += y * tensor->stride_y;
+  pos += z * tensor->stride_z;
+  pos += w * tensor->stride_w;
+  return pos;
+}
+
+#endif // _HELPER_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 000000000..5f1b3f902
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,578 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+/** Convert the given value or vector using the round-to-nearest-even conversion built-in
+ *
+ * @param[in] x    The value to be converted
+ * @param[in] type The destination type; it is pasted into convert_<type>_rte
+ *
+ * @return The converted value
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * Computes input / scale + offset, rounds it to an int with CONVERT_DOWN_RTE and converts the
+ * result to uchar with CONVERT_SAT.
+ *
+ * @param[in] input  Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+  const float scaled = input / scale;
+  const float shifted = scaled + offset;
+  return CONVERT_SAT(CONVERT_DOWN_RTE(shifted, int), uchar);
+}
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return dequantized value, (input - offset) * scale
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+  const float centered = (float)input - offset;
+  return centered * scale;
+}
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input  Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale  Quantization scale
+ *
+ * @return dequantized value, (input - offset) * scale
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+  const float centered = (float)input - offset;
+  return centered * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * The macro defines quantize_<type><size>(input, offset, scale). Per element it evaluates
+ * input / scale + offset, converts to int with CONVERT_DOWN_RTE and then to @p type with
+ * CONVERT_SAT (defined outside this file; presumably a saturating conversion - confirm).
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size) \
+  inline VEC_DATA_TYPE(type, size) \
+    quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+  { \
+    VEC_DATA_TYPE(float, size) \
+    out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+    VEC_DATA_TYPE(type, size) \
+    res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
+                      VEC_DATA_TYPE(type, size)); \
+    return res; \
+  }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * The macro defines dequantize_<type><size>(input, offset, scale), which returns
+ * (float(input) - offset) * scale per element.
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size) \
+  inline VEC_DATA_TYPE(float, size) \
+    dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+  { \
+    return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+  }
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * The macro defines asymm_rounding_divide_by_POW2_<size>(x, exponent). The arithmetic shift
+ * x >> exponent is incremented by one when the discarded low bits (x & mask) exceed the
+ * threshold. The threshold is half of the mask, plus one for negative x, so exact halves are
+ * rounded away from zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
+    VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+  { \
+    const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+    const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+    /* Low `exponent` bits: the part of x that the shift discards. */ \
+    VEC_DATA_TYPE(int, size) \
+    mask = (one << exponent) - one; \
+    VEC_DATA_TYPE(int, size) \
+    threshold = (mask >> 1) + select(zero, one, x < 0); \
+    return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+  }
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    /* Only INT_MIN * INT_MIN (-1 * -1 in fixed point) is flagged; it is replaced by INT_MAX */ \
+    /* in the final select. */ \
+    VEC_DATA_TYPE(int, size) \
+    overflow = a == b && a == INT_MIN; \
+    /* Widen to 64 bits so the full product is available. */ \
+    VEC_DATA_TYPE(long, size) \
+    a_64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b_64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    ab_64 = a_64 * b_64; \
+    /* Revert COMPMID-907 */ \
+    /* Rounding nudge: +2^30 for a non-negative product, 1 - 2^30 for a negative one. */ \
+    VEC_DATA_TYPE(long, size) \
+    mask1 = 1 << 30; \
+    VEC_DATA_TYPE(long, size) \
+    mask2 = 1 - (1 << 30); \
+    VEC_DATA_TYPE(long, size) \
+    is_positive_or_zero = ab_64 >= 0; \
+    VEC_DATA_TYPE(long, size) \
+    nudge = select(mask2, mask1, is_positive_or_zero); \
+    /* Dividing the nudged product by 2^31 yields the high 32 bits of 2 * a * b. */ \
+    VEC_DATA_TYPE(long, size) \
+    mask = 1ll << 31; \
+    VEC_DATA_TYPE(int, size) \
+    ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
+    return select(ab_x2_high32, INT_MAX, overflow); \
+  }
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \
+                                                                          a) \
+  { \
+    /* 1895147668 / 2^31 is about 0.8825, i.e. exp(-1/8); 715827883 / 2^31 is about 1/3. */ \
+    const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
+    const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
+    const int k_fractional_bits = 31; \
+    /* Shift the argument by 1/8 (1 << 28 with 31 fractional bits); the polynomial below is */ \
+    /* evaluated in x = a + 1/8. */ \
+    VEC_DATA_TYPE(int, size) \
+    x = a + (1 << (k_fractional_bits - 3)); \
+    VEC_DATA_TYPE(int, size) \
+    x2 = ASYMM_MULT(x, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x3 = ASYMM_MULT(x2, x, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4 = ASYMM_MULT(x2, x2, size); \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
+    /* ((x^4 / 4 + x^3) / 3 + x^2) / 2 = x^4 / 24 + x^3 / 6 + x^2 / 2 */ \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2 = \
+      ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
+    VEC_DATA_TYPE(int, size) \
+    x4_over_24_plus_x3_over_6_plus_x2_over_2 = \
+      ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
+    /* constant_term * (1 + x + x^2 / 2 + x^3 / 6 + x^4 / 24) */ \
+    return constant_term + \
+           ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
+  }
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result containing bits from @p then_val or from @p else_val depending on whether the
+ * corresponding bit in @p if_mask is set or not.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \
+                                                                VEC_DATA_TYPE(int, size) then_val, \
+                                                                VEC_DATA_TYPE(int, size) else_val) \
+  { \
+    return (if_mask & then_val) ^ (~if_mask & else_val); \
+  }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector in which an element has all bits set when the matching element of @p a
+ * is zero, and no bits set otherwise.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+    const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+    return select(all_zeros, all_ones, a == 0); \
+  }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector in which an element has all bits set when the matching element of @p a
+ * is non-zero, and no bits set otherwise.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) all_zeros = 0; \
+    const VEC_DATA_TYPE(int, size) all_ones = ~0; \
+    return select(all_zeros, all_ones, a != 0); \
+  }
+
+/** One stage of the exponential barrel shifter.
+ *
+ * The macro defines exp_barrel_shifter<size>(result, exponent, fp_multiplier, k_integer_bits,
+ * k_fractional_bits, remainder). When k_integer_bits > exponent, every element whose
+ * @p remainder has bit (k_fractional_bits + exponent) set is replaced by
+ * ASYMM_MULT(result, fp_multiplier); all other elements, and all elements when
+ * k_integer_bits <= exponent, are returned unchanged.
+ *
+ * The shift amount was previously guarded by a second, always-true test of
+ * k_integer_bits > exponent; that dead branch has been removed.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return @p result, conditionally multiplied by @p fp_multiplier.
+ */
+#define EXP_BARREL_SHIFTER_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \
+    VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+    int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+  { \
+    /* This stage's bit cannot be represented with so few integer bits. */ \
+    if (k_integer_bits <= exponent) \
+    { \
+      return result; \
+    } \
+ \
+    const int k_shift_amount = k_fractional_bits + exponent; \
+    return ASYMM_SELECT_USING_MASK( \
+      ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
+      ASYMM_MULT(result, fp_multiplier, size), result, size); \
+  }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+  { \
+    const int k_fractional_bits = 31 - k_integer_bits; \
+    VEC_DATA_TYPE(int, size) \
+    k_one_quarter = 1 << (k_fractional_bits - 2); \
+    VEC_DATA_TYPE(int, size) \
+    mask = k_one_quarter - 1; \
+    /* Keep the bits below 1/4 and subtract 1/4: a value in [-1/4, 0). */ \
+    VEC_DATA_TYPE(int, size) \
+    a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
+    /* Left shift by the integer bits so the value uses 31 fractional bits. */ \
+    VEC_DATA_TYPE(int, size) \
+    a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
+    VEC_DATA_TYPE(int, size) \
+    result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \
+      a_mod_quarter_minus_one_quarter_scaled, size); \
+    /* What still has to be accounted for; the stages below test one bit of it each. */ \
+    VEC_DATA_TYPE(int, size) \
+    remainder = a_mod_quarter_minus_one_quarter - a; \
+ \
+    /* Each multiplier divided by 2^31 is approximately exp(-2^exponent): */ \
+    /* exp(-1/4), exp(-1/2), exp(-1), exp(-2), exp(-4), exp(-8), exp(-16). */ \
+    result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \
+                                remainder, size); \
+    result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \
+                                size); \
+    result = \
+      EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
+ \
+    /* With more than 5 integer bits, inputs below -32 are forced to 0. */ \
+    if (k_integer_bits > 5) \
+    { \
+      const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
+      result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
+    } \
+ \
+    /* exp(0): an input of exactly zero returns INT_MAX, the largest Q0 value. */ \
+    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+    return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
+  }
+
+/** Calculates
the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ * An exponent of zero returns the input unchanged.
+ *
+ * The macro defines asymm_saturating_rounding_mult_by_pow2<size>(x, exponent).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+  { \
+    if (exponent < 0) \
+    { \
+      return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
+    } \
+ \
+    /* Multiplying by 2^0 is the identity. Returning here also keeps the threshold below from */ \
+    /* evaluating (1 << 31) - 1, which overflows a signed int. */ \
+    if (exponent == 0) \
+    { \
+      return x; \
+    } \
+ \
+    const VEC_DATA_TYPE(int, size) min = INT_MIN; \
+    const VEC_DATA_TYPE(int, size) max = INT_MAX; \
+    /* Largest magnitude that can be shifted left by `exponent` without leaving int range. */ \
+    int threshold = ((1 << (31 - exponent)) - 1); \
+    VEC_DATA_TYPE(int, size) \
+    positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
+    VEC_DATA_TYPE(int, size) \
+    result = x << exponent; \
+    /* Elements beyond the threshold saturate to INT_MAX / INT_MIN. */ \
+    result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
+    result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
+    return result; \
+  }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+  { \
+    /* The sum is formed in 64 bits, so a + b cannot overflow. */ \
+    VEC_DATA_TYPE(long, size) \
+    a64 = convert_long##size(a); \
+    VEC_DATA_TYPE(long, size) \
+    b64 = convert_long##size(b); \
+    VEC_DATA_TYPE(long, size) \
+    sum = a64 + b64; \
+    const VEC_DATA_TYPE(long, size) one = 1; \
+    const VEC_DATA_TYPE(long, size) minus_one = -1; \
+    /* Add +1 or -1, matching the sign of the sum, before the division by 2. */ \
+    VEC_DATA_TYPE(long, size) \
+    sign = select(minus_one, one, sum >= 0); \
+    return convert_int##size((sum + sign) / 2); \
+  }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * The macro defines asymm_one_over_one_plus_x_for_x_in_0_1<size>(a). It works on
+ * d = (a + 1) / 2 (the rounding half sum of a and INT_MAX), starts from the estimate
+ * 48/17 - 32/17 * d and applies three refinement steps of the form x += x * (1 - d * x).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+  { \
+    const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
+    const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
+    VEC_DATA_TYPE(int, size) \
+    half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
+    /* 1515870810 / 2^29 is about 48/17 and -1010580540 / 2^29 is about -32/17. */ \
+    const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
+    const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
+    VEC_DATA_TYPE(int, size) \
+    x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
+    for (int i = 0; i < 3; i++) \
+    { \
+      VEC_DATA_TYPE(int, size) \
+      half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
+      VEC_DATA_TYPE(int, size) \
+      one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
+      VEC_DATA_TYPE(int, size) \
+      tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
+      /* The multiplication by 2^2 presumably rescales the product of two values with */ \
+      /* 2 integer bits back to 2 integer bits - TODO confirm. */ \
+      x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
+    } \
+    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
+  }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value
+ * accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * The macro defines asymm_rescale<size>(value, src_integer_bits, dst_integer_bits), which
+ * multiplies @p value by 2^(src_integer_bits - dst_integer_bits) using
+ * ASYMM_SATURATING_ROUNDING_MULT_BY_POW2.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \
+                                                      int src_integer_bits, int dst_integer_bits) \
+  { \
+    int exponent = src_integer_bits - dst_integer_bits; \
+    return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
+  }
+
+/* Dispatch macros: each one pastes the type and/or vector size onto a function name generated */
+/* by the matching *_IMPL macro. The *_STR indirection lets the arguments be macro-expanded */
+/* before the token pasting takes place. */
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) \
+  dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) \
+  DEQUANTIZE_STR(input, offset, scale, type, size)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
+  asymm_rounding_divide_by_POW2_##size(x, exponent)
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+/* NOTE(review): `x` is not parenthesised in the expansion below, so an argument such as */
+/* `a + b` would bind as a + (b * ...). The shift count is -left_shift, i.e. callers are */
+/* presumably expected to pass a non-positive left_shift - confirm against the call sites. */
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+  ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
+/* Fixed-point multiply followed by a rounding right shift of right_shift bits. */
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+  ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+  asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+  asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+                           remainder, size) \
+  exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \
+                           remainder)
+#define
ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \
+  asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \
+  asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+  asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+  asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+/** Multiply by a quantized multiplier with a signed shift.
+ *
+ * The macro defines multiply_by_quantized_multiplier<size>(input, qmul, shift). A positive
+ * @p shift multiplies @p input by 2^shift (plain integer multiplication, not saturating) before
+ * the fixed-point product with @p qmul; a zero or negative @p shift applies a rounding right
+ * shift of -shift bits to that product.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return The scaled value.
+ */
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+  inline VEC_DATA_TYPE(int, size) \
+    multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+  { \
+    const int left_shift = shift > 0 ? shift : 0; \
+    const int right_shift = shift > 0 ? 0 : -shift; \
+    return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
+                                         right_shift, size); \
+  }
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+  multiply_by_quantized_multiplier##size(input, qmul, shift)
+
+/* Instantiate the quantize helpers for the type/size combinations listed below. */
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+/* Instantiate the dequantize helpers for the same type/size combinations. */
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+/* Fixed-point helpers, instantiated per vector size. */
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(1)
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+/* No size-1 variant is instantiated for this family. */
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(1)
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(1)
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(1)
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+/* No size-1 variant is instantiated for this family. */
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+/* No size-1 variant is instantiated for this family. */
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+/* No size-1 variant is instantiated for this family. */
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+/* No size-1 variant is instantiated for this family. */
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(1)
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
+
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) + +#endif // ARM_COMPUTE_HELPERS_ASYMM_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl new file mode 100644 index 000000000..014842680 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) +/** This function normalizes the input 2D tensor across the first dimension with respect to mean and + * standard deviation of the same dimension. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. + * -DDATA_TYPE=float + * @attention Normalization epsilon parameter should be given as a preprocessor argument with + * -DEPSILON=value. e.g. -DEPSILON=0.001f + * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, + * -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 + * + * @param[in] input_ptr Pointer to the first source tensor. 
Supported + * data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr (Optional) Pointer to the destination tensor. + * Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in + * the destination tensor + * @param[in] gamma_ptr (Optional) Pointer to the gamma tensor. 
+ * Supported data types: same as @p input_ptr + * @param[in] gamma_stride_x (Optional) Stride of the gamma tensor in X + * dimension (in bytes) + * @param[in] gamma_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in + * the gamma tensor + * @param[in] beta_ptr (Optional) Pointer to the beta tensor. Supported + * data types: same as @p input_ptr + * @param[in] beta_stride_x (Optional) Stride of the beta tensor in X + * dimension (in bytes) + * @param[in] beta_step_x (Optional) output_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in + * the beta tensor + */ +__kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), +#ifndef IN_PLACE + TENSOR4D_DECLARATION(output) +#endif /* IN_PLACE */ +#ifdef GAMMA + , + VECTOR_DECLARATION(gamma) +#endif // GAMMA +#ifdef BETA + , + VECTOR_DECLARATION(beta) +#endif // BETA + ) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); +#ifndef IN_PLACE + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +#endif /* IN_PLACE */ + + float sum = 0.f; + float sum_sq = 0.f; + +#if defined(NHWC) + + const int ch = get_global_id(0); // Current channel + const int batch = get_global_id(2); // Current batch + const int elements_plane = DIM_Y * DIM_Z; + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + float data = (float)*((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch)); + sum += data; + sum_sq += data * data; + } + } + +#else // !defined(NHWC) + const int ch = get_global_id(2) % DIM_Z; // Current channel + const int batch = get_global_id(2) / DIM_Z; // Current batch + const int elements_plane = DIM_X * DIM_Y; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + part_sum = 0.f; + VEC_DATA_TYPE(DATA_TYPE, 
VEC_SIZE) + part_sum_sq = 0.f; + // Calculate partial sum + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)); + part_sum += data; + part_sum_sq += data * data; + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + DATA_TYPE data = *((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)); + part_sum.s0 += data; + part_sum_sq.s0 += data * data; + } + } +// Perform reduction +#if VEC_SIZE > 8 + part_sum.s01234567 += part_sum.s89abcdef; + part_sum_sq.s01234567 += part_sum_sq.s89abcdef; +#endif // VEC_SIZE > 8 +#if VEC_SIZE > 4 + part_sum.s0123 += part_sum.s4567; + part_sum_sq.s0123 += part_sum_sq.s4567; +#endif // VEC_SIZE > 4 +#if VEC_SIZE > 2 + part_sum.s01 += part_sum.s23; + part_sum_sq.s01 += part_sum_sq.s23; +#endif // VEC_SIZE > 2 + part_sum.s0 += part_sum.s1; + part_sum_sq.s0 += part_sum_sq.s1; + + sum = (float)part_sum.s0; + sum_sq = (float)part_sum_sq.s0; + +#endif // defined(NHWC) + + const float mean_float = (sum / elements_plane); + const DATA_TYPE mean = (DATA_TYPE)mean_float; + const float var_float = (sum_sq / elements_plane) - (mean_float * mean_float); +#if defined(GAMMA) + const float multip_float = *((__global DATA_TYPE *)gamma_ptr + ch) / sqrt(var_float + EPSILON); + const DATA_TYPE multip = (DATA_TYPE)multip_float; +#else // !defined(GAMMA) + const DATA_TYPE multip = (DATA_TYPE)0; +#endif // defined(GAMMA) +#if defined(BETA) + const DATA_TYPE beta = *((__global DATA_TYPE *)beta_ptr + ch); +#else // !defined(BETA) + const DATA_TYPE beta = 0; +#endif // defined(BETA) + +#if defined(NHWC) + + for (int i_w = 0; i_w < DIM_Y; ++i_w) + { + for (int i_h = 0; i_h < DIM_Z; ++i_h) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; 
+#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } + +#else // !defined(NHWC) + for (int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, input_address); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = (data - mean) * multip + beta; + VSTORE(VEC_SIZE) + (res, 0, output_address); + } + // Left-overs loop + for (; x < DIM_X; ++x) + { + __global DATA_TYPE *input_address = + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address)-mean) * multip + beta; + } + } +#endif // defined(NHWC) +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ + defined(DIM_Y) && defined(DIM_Z) */ diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl new file mode 100644 index 000000000..3943fc4c2 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) + +/** This performs to multiply input by scale_factor. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
+ * -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note The per-row scale factors are read from the scale tensor and combined with the + * multiplier kernel argument; this kernel does not read a -DSCALE define. + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: S32 (the kernel reads it through an int pointer) + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] scale_ptr Pointer to the scale tensor. Supported data + * types: same as -DDATA_TYPE (one value per row) + * @param[in] scale_stride_x Stride of the scale tensor in X dimension (in + * bytes) + * @param[in] scale_step_x scale_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] scale_offset_first_element_in_bytes The offset of the first element in the scale + * tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale), + IMAGE_DECLARATION(output), float multiplier) +{ + // Get pixels pointer + Image input = CONVERT_TO_IMAGE_STRUCT(input); + Image output = CONVERT_TO_IMAGE_STRUCT(output); + +#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global int *)input.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + + // Create scale vector + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + vscale = *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)); + + // Dequantize + vscale *= (DATA_TYPE)(multiplier); + val *= vscale; + + // Store result + VSTORE(VEC_SIZE) + (val, 0, (__global DATA_TYPE *)output.ptr); +#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE *)(output.ptr)) = + ((DATA_TYPE)(*((__global int *)(input.ptr)))) * + *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); +#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) +} + +#endif // defined(VEC_SIZE) && defined(DATA_TYPE) diff --git 
a/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl new file mode 100644 index 000000000..15c16f80c --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/neg_tensor.cl @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#ifndef VEC_SIZE +#define VEC_SIZE 1 +#endif + +#if defined(DATA_TYPE) +/** Performs a negation of input tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * + * @param[in] in_ptr Pointer to the source image. Supported data types: + * S16/S32/F16/F32. + * @param[in] in_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data + * types: same as @p input_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination + * image + * + */ +__kernel void neg_tensor(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE) + (-VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(DATA_TYPE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl new file mode 100644 index 000000000..c274aba62 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z) + +/** Performs the OneHot operation along the chosen axis + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 + * @attention Input tensor depth should be given as a preprocessor argument using + * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 + * + * + * @param[in] indices_ptr Pointer to the source tensor. 
Supported data + * types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along + * Y processed per work item (in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along + * Z processed per work item (in bytes) + * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source + * tensor + * @param[in] on_value_ptr Pointer to the on_value vector. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32. + * @param[in] on_value_stride_x Stride of the on_value vector in X dimension + * (in bytes) + * @param[in] on_value_step_x on_value_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value + * vector + * @param[in] off_value_ptr Pointer to the off_value vector. Supported + * data types: Same as @p on_value. + * @param[in] off_value_stride_x Stride of the off_value vector in X + * dimension (in bytes) + * @param[in] off_value_step_x off_value_stride_x * number of elements + * along X processed per work item (in bytes) + * @param[in] off_value_offset_first_element_in_bytes Offset of the first element in the off_value + * vector + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p on_value + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] output_offset_first_element_in_bytes Offset of the first element in the + * destination tensor + */ +__kernel void one_hot(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value), + VECTOR_DECLARATION(off_value), TENSOR4D_DECLARATION(output)) +{ + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pz = get_global_id(2) % OUTPUT_DIM_Z; + const int pw = get_global_id(2) / OUTPUT_DIM_Z; + + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); + +#if AXIS == 0 + const int index = *(__global const int *)tensor3D_offset(&indices, py, pz, pw); + *(__global DATA_TYPE *)output.ptr = index == px ? *((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#elif AXIS == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, pz, pw); + *(__global DATA_TYPE *)output.ptr = index == py ? 
*((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#elif AXIS == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pw); + *(__global DATA_TYPE *)output.ptr = index == pz ? *((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#elif AXIS == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); + *(__global DATA_TYPE *)output.ptr = index == pw ? *((__global const DATA_TYPE *)on_value_ptr) + : *((__global const DATA_TYPE *)off_value_ptr); +#endif // AXIS +} + +/** Performs the OneHot operation along the chosen axis as off_value being zero + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 + * @attention Output tensor depth should be given as a preprocessor argument using + * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16 + * @attention Input tensor depth should be given as a preprocessor argument using + * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 + * + * + * @param[in] indices_ptr Pointer to the source tensor. 
Supported data + * types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along + * Y processed per work item (in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along + * Z processed per work item (in bytes) + * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source + * tensor + * @param[in] on_value_ptr Pointer to the on_value vector. Supported + * data types: U8/S8/U16/S16/F16/U32/S32/F32. + * @param[in] on_value_stride_x Stride of the on_value vector in X dimension + * (in bytes) + * @param[in] on_value_step_x on_value_stride_x * number of elements along + * X processed per work item (in bytes) + * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value + * vector + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p on_value + * @param[in] output_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per work item (in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y + * dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per work item (in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z + * dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per work item (in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W + * dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per work item (in bytes) + * @param[in] output_offset_first_element_in_bytes Offset of the first element in the + * destination tensor + */ +__kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value), + TENSOR4D_DECLARATION(output)) +{ + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pz = get_global_id(2); + + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); + const Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, OUTPUT_DIM_Z); + + const int index = *(__global const int *)tensor3D_offset(&indices, px, py, pz); + + if (index < 0 || index >= DEPTH) + return; + +#if AXIS == 0 + *(__global DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) = + *((__global const DATA_TYPE *)on_value_ptr); +#elif AXIS == 1 + *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) = + *((__global const DATA_TYPE *)on_value_ptr); +#elif AXIS == 2 + *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) = + *((__global const DATA_TYPE *)on_value_ptr); +#elif AXIS == 3 + *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, 
index) = + *((__global const DATA_TYPE *)on_value_ptr); +#endif // AXIS +} + +#endif // defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl new file mode 100644 index 000000000..76fda9041 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers_asymm.h" + +#ifdef SATURATE +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) +#else /* SATURATE */ +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) +#endif /* SATURATE */ +#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of + * GEMMLowp to QASYMM8 + * + * The following computations will be performed by the kernel: + * + * -# Add offset terms to inputs + * -# Multiply inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using + * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and + * -DIN2_OFFSET + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and + * -DRESULT_SHIFT + * + * @param[in] in1_ptr Pointer to the source image. 
Supported data types: + * U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: + * U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in + * bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data + * types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * image + * @param[in] scale Float scaling factor. Supported data types: F32 + */ +__kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), const float scale) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Perform multiplication of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; + + // Multiply with a multiplier smaller than 1 + out_val = + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + + // TODO: Apply min-max BOUND to support fuse with relu. 
+ /* + #if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); + #endif // defined(MIN_BOUND) + #if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); + #endif // defined(MAX_BOUND) + */ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl new file mode 100644 index 000000000..4ae9adb0b --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +

#include "helpers.h"

// Round-to-nearest-even conversions (scalar and vector flavours)
#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
// Symmetric quantized range used by the clamp below
#define MIN_QUANT_VAL -127
#define MAX_QUANT_VAL 127

#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)

/** Quantizes a floating point 2D input to symmetric 8-bit integers, one scale per row.
 *
 * Every input element is divided by the scale stored at index get_global_id(1) of the scale
 * vector, converted to int with round-to-nearest-even and clamped to
 * [MIN_QUANT_VAL, MAX_QUANT_VAL] (i.e. [-127, 127]) before being stored as DATA_TYPE_OUT.
 *
 * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type.
 *       e.g. -DDATA_TYPE_IN=float
 * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type.
 *       e.g. -DDATA_TYPE_OUT=char
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size.
 *       e.g. -DVEC_SIZE=16
 * @note When -DLAST_ACCESSED_X is also given, each work item handles VEC_SIZE elements and a
 *       work item whose window would start past LAST_ACCESSED_X is shifted back so that it
 *       starts at LAST_ACCESSED_X. Without it, one element is processed per work item.
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
 *                                                  types: F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension
 *                                                  (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension
 *                                                  (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
 *                                                  tensor
 * @param[in]  scale_ptr                            Pointer to the scale vector (read only, one
 *                                                  scale per row). Supported data types: F32
 * @param[in]  scale_stride_x                       Stride of the scale vector in X dimension
 *                                                  (in bytes)
 * @param[in]  scale_step_x                         scale_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported
 *                                                  data types: S8
 * @param[in]  output_stride_x                      Stride of the destination tensor in X
 *                                                  dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y
 *                                                  dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
 *                                                  destination tensor
 */
__kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(scale),
                                 IMAGE_DECLARATION(output))
{
  // Resolve the source/destination pointers of this work item
  Image src = CONVERT_TO_IMAGE_STRUCT(input);
  Image dst = CONVERT_TO_IMAGE_STRUCT(output);

  // Scale of the row this work item belongs to
  const DATA_TYPE_IN row_scale = *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1));

#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
  // Shift the access window back when it would start beyond LAST_ACCESSED_X
  const int first_x = (int)(get_global_id(0) * VEC_SIZE);
  const int overshoot = max(first_x - (int)LAST_ACCESSED_X, 0);
  src.ptr -= overshoot * input_stride_x;
  dst.ptr -= overshoot * output_stride_x;

  // Load VEC_SIZE inputs and broadcast the row scale over the vector lanes
  const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) values =
    VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)src.ptr);
  const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) scales = row_scale;

  // Quantize: divide, round to nearest even, clamp to the symmetric range
  const VEC_DATA_TYPE(int, VEC_SIZE) quantized =
    CLAMP(CONVERT_RTE_VEC(values / scales, int, VEC_SIZE), MIN_QUANT_VAL, MAX_QUANT_VAL);

  // Narrow to the output type and store
  VSTORE(VEC_SIZE)
  (CONVERT(quantized, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
   (__global DATA_TYPE_OUT *)dst.ptr);
#else  //! defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
  // Scalar path: one element per work item
  const DATA_TYPE_IN value = *(__global DATA_TYPE_IN *)src.ptr;
  *((__global DATA_TYPE_OUT *)(dst.ptr)) =
    (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(value / row_scale, int), MIN_QUANT_VAL, MAX_QUANT_VAL);
#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
}
#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl new file mode 100644 index 000000000..832ac1270 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +

#include "helpers.h"

#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
/** Reduces a 4D tensor along one axis keeping the maximum or the minimum element.
 *
 * One output element is produced per work item. The work item starts from the input element at
 * its own coordinates and then folds in the elements at positions 1 .. dim-1 along @p axis.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
 *       -DDATA_TYPE=short
 * @attention Output tensor depth should be given as a preprocessor argument using
 *            -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
 * @attention Operation type(code) should be passed as preprocessor argument using
 *            -DOP_CODE=number: 1 selects REDUCE_MAX, 2 selects REDUCE_MIN. With any other code
 *            the kernel returns from the first loop iteration without writing the output.
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
 *                                                  types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension
 *                                                  (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension
 *                                                  (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension
 *                                                  (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension
 *                                                  (in bytes)
 * @param[in]  input_step_w                         input_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
 *                                                  tensor
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported
 *                                                  data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X
 *                                                  dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y
 *                                                  dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z
 *                                                  dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_w                      Stride of the destination tensor in W
 *                                                  dimension (in bytes)
 * @param[in]  output_step_w                        output_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
 *                                                  destination tensor
 * @param[in]  axis                                 Axis through which reduction occurs
 * @param[in]  dim                                  Dimension across the axis to be reduced.
 */
__kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
                             const int axis, const int dim)
{
  Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
  Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);

  // The third global dimension carries both depth (z) and batch (w)
  int coord[4];
  coord[0] = get_global_id(0);
  coord[1] = get_global_id(1);
  coord[2] = get_global_id(2) % DEPTH_OUT;
  coord[3] = get_global_id(2) / DEPTH_OUT;

  // Seed the accumulator with the element at the work item's own coordinates
  DATA_TYPE acc =
    *((__global DATA_TYPE *)tensor4D_offset(&src, coord[0], coord[1], coord[2], coord[3]));

  int pos = 1;
  while (pos < dim)
  {
    coord[axis] = pos;

#if OP_CODE == 1 // REDUCE_MAX
    acc = max(acc, *((__global DATA_TYPE *)tensor4D_offset(&src, coord[0], coord[1], coord[2],
                                                           coord[3])));

#elif OP_CODE == 2 // REDUCE_MIN
    acc = min(acc, *((__global DATA_TYPE *)tensor4D_offset(&src, coord[0], coord[1], coord[2],
                                                           coord[3])));

#else // OP NOT SUPPORTED
    return;

#endif
    ++pos;
  }

  *((__global DATA_TYPE *)dst.ptr) = acc;
}

/** Reduces a 4D tensor along one axis computing the sum or the mean of the elements.
 *
 * One output element is produced per work item: the elements at positions 0 .. dim-1 along
 * @p axis are accumulated in DATA_TYPE; for the mean the sum is divided by @p dim converted to
 * DATA_TYPE.
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
 *       -DDATA_TYPE=short
 * @attention Output tensor depth should be given as a preprocessor argument using
 *            -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
 * @attention Operation type(code) should be passed as preprocessor argument using
 *            -DOP_CODE=number: 3 selects REDUCE_SUM, 4 selects REDUCE_MEAN. With any other code
 *            nothing is written.
 *
 * @param[in]  input_ptr                            Pointer to the source tensor. Supported data
 *                                                  types: U8/S8/U16/S16/F16/U32/S32/F32
 * @param[in]  input_stride_x                       Stride of the source tensor in X dimension
 *                                                  (in bytes)
 * @param[in]  input_step_x                         input_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension
 *                                                  (in bytes)
 * @param[in]  input_step_y                         input_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension
 *                                                  (in bytes)
 * @param[in]  input_step_z                         input_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_stride_w                       Stride of the source tensor in W dimension
 *                                                  (in bytes)
 * @param[in]  input_step_w                         input_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source
 *                                                  tensor
 * @param[out] output_ptr                           Pointer to the destination tensor. Supported
 *                                                  data types: same as @p input_ptr
 * @param[in]  output_stride_x                      Stride of the destination tensor in X
 *                                                  dimension (in bytes)
 * @param[in]  output_step_x                        output_stride_x * number of elements along X
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_y                      Stride of the destination tensor in Y
 *                                                  dimension (in bytes)
 * @param[in]  output_step_y                        output_stride_y * number of elements along Y
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_z                      Stride of the destination tensor in Z
 *                                                  dimension (in bytes)
 * @param[in]  output_step_z                        output_stride_z * number of elements along Z
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_stride_w                      Stride of the destination tensor in W
 *                                                  dimension (in bytes)
 * @param[in]  output_step_w                        output_stride_w * number of elements along W
 *                                                  processed per workitem(in bytes)
 * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the
 *                                                  destination tensor
 * @param[in]  axis                                 Axis through which reduction occurs
 * @param[in]  dim                                  Dimension across the axis to be reduced.
 */
__kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output),
                              const int axis, const int dim)
{
  Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
  Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);

  // The third global dimension carries both depth (z) and batch (w)
  int coord[4];
  coord[0] = get_global_id(0);
  coord[1] = get_global_id(1);
  coord[2] = get_global_id(2) % DEPTH_OUT;
  coord[3] = get_global_id(2) / DEPTH_OUT;

  // Accumulate every element along the reduced axis
  DATA_TYPE total = (DATA_TYPE)0;
  int pos = 0;
  while (pos < dim)
  {
    coord[axis] = pos;
    total +=
      *((__global DATA_TYPE *)tensor4D_offset(&src, coord[0], coord[1], coord[2], coord[3]));
    ++pos;
  }

#if OP_CODE == 3 // REDUCE_SUM
  *((__global DATA_TYPE *)dst.ptr) = total;

#elif OP_CODE == 4 // REDUCE_MEAN
  *((__global DATA_TYPE *)dst.ptr) = total / CONVERT(dim, DATA_TYPE);

#else // OP NOT SUPPORTED
  return;

#endif
}
#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl new file mode 100644 index 000000000..3d5e90356 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/scale_factor.cl @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +

#include "helpers.h"

#if defined(WIDTH)
/** Computes the symmetric 8-bit quantization scale of one row of an F32 image.
 *
 * The work item scans WIDTH floats starting at its source pointer, tracks the minimum and the
 * maximum value, and stores max(|min|, |max|) / 127 in the destination vector at index
 * get_global_id(1).
 *
 * @note The number of elements scanned per row must be provided at compile time using -DWIDTH
 *       (e.g. -DWIDTH=320)
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data
 *                                               types: F32
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in
 *                                               bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X
 *                                               processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in
 *                                               bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y
 *                                               processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source
 *                                               image
 * @param[out] dst_ptr                           Pointer to the scale vector (one scale per row).
 *                                               Supported data types: F32.
 * @param[in]  dst_stride_x                      Stride of the scale vector in X dimension (in
 *                                               bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X
 *                                               processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the scale
 *                                               vector
 */
__kernel void scale_factor_symm8(IMAGE_DECLARATION(src), VECTOR_DECLARATION(dst))
{
  Image row = CONVERT_TO_IMAGE_STRUCT(src);
  __global float *row_data = (__global float *)(row.ptr);

  // Four running minima / maxima, collapsed to a single value after the scan
  float4 lo = (float4)FLT_MAX;
  float4 hi = (float4)-FLT_MAX;

  // Main loop: eight floats per iteration, handled as two float4 halves
  int col = 0;
  while (col <= (int)(WIDTH - 8))
  {
    const float8 chunk = vload8(0, (__global float *)(row_data + col));
    const float4 front = chunk.s0123;
    const float4 back = chunk.s4567;

    // select(a, b, c) picks b where c is true: keep the running value when it already wins
    lo = select(front, lo, lo < front);
    lo = select(back, lo, lo < back);

    hi = select(front, hi, hi > front);
    hi = select(back, hi, hi > back);

    col += 8;
  }

  // Left-over elements (WIDTH not a multiple of 8) are folded into lane 0
  while (col < WIDTH)
  {
    const float v = *(row_data + col);

    lo.s0 = min(lo.s0, v);
    hi.s0 = max(hi.s0, v);
    ++col;
  }

  // Horizontal min/max reduction over the four lanes
  const float2 lo_pair = min(lo.s01, lo.s23);
  const float2 hi_pair = max(hi.s01, hi.s23);
  const float row_min = min(lo_pair.s0, lo_pair.s1);
  const float row_max = max(hi_pair.s0, hi_pair.s1);

  // Scale that maps the largest magnitude of the row onto 127
  const float scale = max(fabs(row_min), fabs(row_max)) / 127.0f;

  // Store the scale of this row
  *((__global float *)(dst_ptr) + get_global_id(1)) = scale;
}
#endif // defined(WIDTH)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl new file mode 100644 index 000000000..3eb1a4ce7 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +

#include "helpers.h"

/** Copies the input vector into the key buffer and fills the index buffer with 0..n-1.
 *
 * Each work item handles the elements gid, gid + gws, gid + 2 * gws, ... below @p n, where gws
 * is local size * number of groups. Elements in the tail (n not a multiple of gws) are covered
 * as well, so no key/index is left uninitialized.
 *
 * @param[in]  input      Source vector (VECTOR_DECLARATION), read as float
 * @param[out] in_key_buf Key buffer receiving the input values (at least n floats)
 * @param[out] in_ind_buf Index buffer receiving the element positions (at least n ints)
 * @param[in]  n          Number of elements
 */
__kernel void topkv2_init(VECTOR_DECLARATION(input), __global float *in_key_buf,
                          __global int *in_ind_buf, const int n)
{
  const int gid = get_global_id(0);
  const int gws = get_local_size(0) * get_num_groups(0);

  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);

  // Same element set as the former "n / gws iterations" loop, plus the remainder elements
  for (int idx = gid; idx < n; idx += gws)
  {
    in_key_buf[idx] = *(__global float *)(input.ptr + idx * input.stride_x);
    in_ind_buf[idx] = idx;
  }
}

/** Finds the position of the first key whose sign bit is set.
 *
 * NOTE(review): presumably out_key_buf holds keys ordered by their raw bit pattern (as produced
 * by the radix sort kernels), i.e. all non-negative keys first - confirm against the caller.
 *
 * The sign is tested with signbit() rather than with comparisons against 0.f: a comparison
 * treats +0.0 / -0.0 as neither positive nor negative, which left *first_negative_idx unwritten
 * when a zero sat at the boundary (e.g. keys {0.0, -1.0}) or when n == 1 with a negative key.
 *
 * Exactly one work item writes the result:
 *  - the work item at the first key with the sign bit set writes its own index;
 *  - if the last key has no sign bit (no negative key at all) the last work item writes n.
 *
 * @param[in]  out_key_buf        Key buffer to inspect (n floats)
 * @param[out] first_negative_idx Receives the index of the first negative key, or n
 * @param[in]  n                  Number of keys
 */
__kernel void topkv2_find_first_negative(__global float *out_key_buf,
                                         __global int *first_negative_idx, int n)
{
  const int gid = get_global_id(0);

  // Work items beyond the key range have nothing to inspect
  if (gid >= n)
    return;

  if (signbit(out_key_buf[gid]))
  {
    // First negative key: either the very first key or the one right after a non-negative key
    if (gid == 0 || !signbit(out_key_buf[gid - 1]))
      *first_negative_idx = gid;
  }
  else if (gid == n - 1)
  {
    // The last key is non-negative, hence there is no negative key: report n
    *first_negative_idx = n;
  }
}

/** Moves the negative keys (and their indices) to the front in reversed order.
 *
 * With num_negs = n - *first_negative_idx, output position gid takes input position
 * n - 1 - gid when gid < num_negs (the negative tail, reversed) and gid - num_negs otherwise
 * (the leading non-negative part, order kept).
 *
 * The index buffers are declared as int to match the other topkv2 kernels; the elements are
 * copied unchanged.
 *
 * @param[in]  in_key_buf         Source keys (n floats)
 * @param[out] out_key_buf        Reordered keys (n floats)
 * @param[in]  in_ind_buf         Source indices (n ints)
 * @param[out] out_ind_buf        Reordered indices (n ints)
 * @param[in]  first_negative_idx Index of the first negative key in @p in_key_buf, or n
 * @param[in]  n                  Number of keys
 */
__kernel void topkv2_reorder_negatives(__global float *in_key_buf, __global float *out_key_buf,
                                       __global int *in_ind_buf, __global int *out_ind_buf,
                                       __global int *first_negative_idx, int n)
{
  const int gid = get_global_id(0);

  const int num_negs = n - *first_negative_idx;
  const int in_idx = (gid < num_negs) ? (n - 1 - gid) : (gid - num_negs);

  out_key_buf[gid] = in_key_buf[in_idx];
  out_ind_buf[gid] = in_ind_buf[in_idx];
}

/** Writes keys and indices to the output vectors in reversed order.
 *
 * Output element gid receives element n - 1 - gid of the key / index buffers.
 *
 * @param[out] values      Destination vector for the keys (VECTOR_DECLARATION), written as float
 * @param[out] indices     Destination vector for the indices (VECTOR_DECLARATION), written as int
 * @param[in]  out_key_buf Key buffer (n floats)
 * @param[in]  out_ind_buf Index buffer (n ints)
 * @param[in]  n           Number of elements in the buffers
 */
__kernel void topkv2_store(VECTOR_DECLARATION(values), VECTOR_DECLARATION(indices),
                           __global float *out_key_buf, __global int *out_ind_buf, int n)
{
  const int gid = get_global_id(0);

  Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values);
  Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);

  const int idx = n - 1 - gid;

  *(__global float *)(values.ptr + gid * values.stride_x) = out_key_buf[idx];
  *(__global int *)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx];
}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl new file mode 100644 index 000000000..460de790b --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +

#include "helpers.h"

/** Returns the address of float element @p idx of @p vec (idx * stride_x bytes from ptr). */
__global inline float *get_vec_elem(Vector *vec, int idx)
{
  return (__global float *)(vec->ptr + idx * vec->stride_x);
}

/** Returns the address of int element @p idx of @p vec (idx * stride_x bytes from ptr). */
__global inline int *get_vec_elem_int(Vector *vec, int idx)
{
  return (__global int *)(vec->ptr + idx * vec->stride_x);
}

/** Exchanges the two floats pointed to by @p a and @p b. */
void swap(__global float *a, __global float *b)
{
  float t = *a;
  *a = *b;
  *b = t;
}

/** Exchanges the two ints pointed to by @p a and @p b. */
void swap_idx(__global int *a, __global int *b)
{
  int t = *a;
  *a = *b;
  *b = t;
}

/** Partitions arr[l..h] around the pivot arr[h] in descending order.
 *
 * Elements greater than or equal to the pivot are moved in front of it; @p indices is permuted
 * in lock-step with the values.
 *
 * @param[in,out] arr     Vector being sorted
 * @param[in,out] indices Index array permuted together with @p arr
 * @param[in]     l       First position of the range (inclusive)
 * @param[in]     h       Last position of the range (inclusive); holds the pivot
 *
 * @return Final position of the pivot
 */
int partition(Vector *arr, __global int *indices, int l, int h)
{
  float x = *get_vec_elem(arr, h);
  int i = (l - 1);

  for (int j = l; j <= h - 1; j++)
  {
    if (*get_vec_elem(arr, j) >= x)
    {
      i++;
      swap(get_vec_elem(arr, i), get_vec_elem(arr, j));
      swap_idx(&indices[i], &indices[j]);
    }
  }
  swap(get_vec_elem(arr, i + 1), get_vec_elem(arr, h));
  swap_idx(&indices[i + 1], &indices[h]);
  return (i + 1);
}

/** Sorts arr[l..h] in descending order with an explicit stack instead of recursion.
 *
 * @param[in,out] arr     Vector to be sorted
 * @param[in,out] indices Index array permuted together with @p arr
 * @param[in,out] stack   Scratch buffer used as a stack of (l, h) pairs
 * @param[in]     l       Starting index (inclusive)
 * @param[in]     h       Ending index (inclusive)
 */
void quickSortIterative(Vector *arr, __global int *indices, __global int *stack, int l, int h)
{
  // Empty or single-element range: already sorted. This also keeps an empty input
  // (h == l - 1) from touching arr[h], which would be out of bounds.
  if (l >= h)
    return;

  // initialize top of stack
  int top = -1;

  // push initial values of l and h to stack
  stack[++top] = l;
  stack[++top] = h;

  // Keep popping from stack while it is not empty
  while (top >= 0)
  {
    // Pop h and l
    h = stack[top--];
    l = stack[top--];

    // Set pivot element at its correct position in sorted array
    const int p = partition(arr, indices, l, h);

    const int has_left = (p - 1 > l);  // more than one element left of the pivot
    const int has_right = (p + 1 < h); // more than one element right of the pivot

    // The two sides are disjoint, so the order in which they are processed does not affect
    // the result. Push the larger side first so that the smaller one is popped next; this
    // keeps the number of pending ranges small even for unbalanced partitions.
    if ((p - l) > (h - p))
    {
      if (has_left)
      {
        stack[++top] = l;
        stack[++top] = p - 1;
      }
      if (has_right)
      {
        stack[++top] = p + 1;
        stack[++top] = h;
      }
    }
    else
    {
      if (has_right)
      {
        stack[++top] = p + 1;
        stack[++top] = h;
      }
      if (has_left)
      {
        stack[++top] = l;
        stack[++top] = p - 1;
      }
    }
  }
}

/** Sorts the input vector in place (descending) and extracts the first k values and indices.
 *
 * The kernel body contains no work-item id, so every launched work item executes the full
 * sort on the same buffers.
 *
 * @param[in,out] input        Vector to sort (VECTOR_DECLARATION), float elements
 * @param[out]    topk_values  Receives the k largest values (VECTOR_DECLARATION), float
 * @param[out]    topk_indices Receives the original positions of those values
 *                             (VECTOR_DECLARATION), int
 * @param[out]    indices      Scratch index buffer (n ints)
 * @param[in,out] temp_stack   Scratch buffer used as the quicksort stack
 * @param[in]     k            Number of elements to extract
 * @param[in]     n            Number of elements in @p input
 */
__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), VECTOR_DECLARATION(topk_values),
                               VECTOR_DECLARATION(topk_indices), __global int *indices,
                               __global int *temp_stack, int k, int n)
{
  Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input);
  Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values);
  Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices);

  // Identity permutation; it is reordered together with the values
  for (int i = 0; i < n; ++i)
  {
    indices[i] = i;
  }

  quickSortIterative(&input, indices, temp_stack, 0, n - 1);

  // extract k items.
  for (int i = 0; i < k; ++i)
  {
    *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i);
    *get_vec_elem_int(&topk_indices, i) = indices[i];
  }
}
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl new file mode 100644 index 000000000..e9d4696b4 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// reference: +// https://code.google.com/archive/p/ocl-radix-sort/source/default/source +// OpenCL kernel sources for the CLRadixSort class +// the #include does not exist in OpenCL +// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr +// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html +// if you find this software usefull you can cite the following work in your reports or articles: +// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. 
// http://hal.archives-ouvertes.fr/hal-00596730

// Reference for floating point radix sort:
// http://www.codercorner.com/RadixSortRevisited.htm

// Per-pass histogram: every work item counts, for the sub-list of keys it owns,
// how many keys fall into each of the _RADIX digit buckets of the current pass.
//   in_key_buf   : keys (float values whose bit pattern is used as the sort key)
//   d_Histograms : output, laid out as [digit][group][local id]
//   pass         : index of the _BITS-wide digit that is examined
//   loc_histo    : local scratch, _RADIX * local_size ints
//   n            : total number of keys
__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
                                  const int pass, __local int *loc_histo, const int n)
{
  const int lid = get_local_id(0);
  const int gid = get_global_id(0);
  const int group = get_group_id(0);
  const int num_groups = get_num_groups(0);
  const int local_size = get_local_size(0);

  // Clear this work item's column of the local histogram.
  for (int digit = 0; digit < _RADIX; ++digit)
  {
    loc_histo[digit * local_size + lid] = 0;
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  // Number of keys handled by one work item.
  const int chunk = n / num_groups / local_size;
#ifndef TRANSPOSE
  // First key of this work item's contiguous sub-list.
  const int first = gid * chunk;
#endif

  for (int j = 0; j < chunk; ++j)
  {
    // The key position depends on whether the list has been transposed.
#ifdef TRANSPOSE
    const int k = num_groups * local_size * j + gid;
#else
    const int k = first + j;
#endif

    // Read the raw bits of the float key.
    const unsigned int key = *((__global unsigned int *)(in_key_buf + k));

    // Digit of the current pass, in the range 0.._RADIX-1.
    const int digit = ((key >> (pass * _BITS)) & (_RADIX - 1));

    loc_histo[digit * local_size + lid]++;
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  // Publish the local counts to the global histogram.
  for (int digit = 0; digit < _RADIX; ++digit)
  {
    d_Histograms[local_size * (digit * num_groups + group) + lid] =
      loc_histo[digit * local_size + lid];
  }

  barrier(CLK_GLOBAL_MEM_FENCE);
}

// Tiled transpose of the key list (and, when PERMUT is defined, of the
// permutation list) through local memory.
//   invect/outvect   : source and destination matrices (nbrow x nbcol -> nbcol x nbrow)
//   inperm/outperm   : permutation source and destination, only touched with PERMUT
//   blockmat/blockperm : local tiles of tilesize * tilesize ints
__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
                        const int nbrow, const __global int *inperm, __global int *outperm,
                        __local int *blockmat, __local int *blockperm, const int tilesize)
{
  const int row0 = get_global_id(0) * tilesize; // first row handled by this work item
  const int col = get_global_id(1);             // column in the source matrix
  const int local_col = get_local_id(1);        // column inside the tile

  // Stage one tile column in local memory.
  for (int r = 0; r < tilesize; ++r)
  {
    const int src = (row0 + r) * nbcol + col;
    blockmat[r * tilesize + local_col] = invect[src];
#ifdef PERMUT
    blockperm[r * tilesize + local_col] = inperm[src];
#endif
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  // First row of this tile in the transposed matrix.
  const int trow0 = get_group_id(1) * tilesize;

  // Write the tile back with rows and columns swapped.
  for (int r = 0; r < tilesize; ++r)
  {
    const int dst = (trow0 + r) * nbrow + row0 + local_col;
    outvect[dst] = blockmat[local_col * tilesize + r];
#ifdef PERMUT
    outperm[dst] = blockperm[local_col * tilesize + r];
#endif
  }
}

// Scatter pass: each work item moves the keys of its sub-list to the positions
// given by the scanned histogram, advancing the per-digit cursor as it goes.
//   in_key/out_key         : source and destination key buffers
//   d_Histograms           : scanned histogram, laid out as [digit][group][local id]
//   indices_in/indices_out : permutation buffers, only touched with PERMUT
//   loc_histo              : local scratch, _RADIX * local_size ints
__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
                                __global int *d_Histograms, const int pass,
                                __global int *indices_in, __global int *indices_out,
                                __local int *loc_histo, const int n)
{
  const int lid = get_local_id(0);
  const int gid = get_global_id(0);
  const int group = get_group_id(0);
  const int num_groups = get_num_groups(0);
  const int local_size = get_local_size(0);

  // Number of keys handled by one work item.
  const int chunk = n / num_groups / local_size;
#ifndef TRANSPOSE
  const int first = gid * chunk;
#endif

  // Cache this work item's slice of the scanned histogram.
  for (int digit = 0; digit < _RADIX; ++digit)
  {
    loc_histo[digit * local_size + lid] =
      d_Histograms[local_size * (digit * num_groups + group) + lid];
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  for (int j = 0; j < chunk; ++j)
  {
#ifdef TRANSPOSE
    const int k = num_groups * local_size * j + gid;
#else
    const int k = first + j;
#endif
    const float value = in_key[k];
    const unsigned int key = *(__global unsigned int *)(in_key + k);
    const int digit = ((key >> (pass * _BITS)) & (_RADIX - 1));

    // Next free slot for this digit.
    const int cursor = digit * local_size + lid;
    const int pos = loc_histo[cursor];

#ifdef TRANSPOSE
    // Map the linear position back into the transposed layout.
    const int dst = (pos % chunk) * (num_groups * local_size) + (pos / chunk);
#else
    const int dst = pos;
#endif

    // The float value itself is stored, not its reinterpreted bits.
    out_key[dst] = value;

#ifdef PERMUT
    indices_out[dst] = indices_in[k];
#endif

    loc_histo[cursor] = pos + 1;
  }
}

// Exclusive parallel prefix sum (scan) over the histograms, following the
// up-sweep / down-sweep scheme of Blelloch 1990; each work item handles two
// entries. The total of every work group is stored in globsum[group].
// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
                                       __global int *globsum)
{
  const int lid = get_local_id(0);
  const int gid = get_global_id(0);
  const int group = get_group_id(0);
  int n = get_local_size(0) * 2; // entries scanned by one work group
  int stride = 1;

  // Load the two entries owned by this work item.
  temp[2 * lid] = histo[2 * gid];
  temp[2 * lid + 1] = histo[2 * gid + 1];

  // Up-sweep phase: build partial sums in place.
  for (int d = n >> 1; d > 0; d >>= 1)
  {
    barrier(CLK_LOCAL_MEM_FENCE);
    if (lid < d)
    {
      const int ai = stride * (2 * lid + 1) - 1;
      const int bi = stride * (2 * lid + 2) - 1;
      temp[bi] += temp[ai];
    }
    stride *= 2;
  }

  // Keep the group total for the global scan, then clear the last element.
  if (lid == 0)
  {
    globsum[group] = temp[n - 1];
    temp[n - 1] = 0;
  }

  // Down-sweep phase.
  for (int d = 1; d < n; d *= 2)
  {
    stride >>= 1;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid < d)
    {
      const int ai = stride * (2 * lid + 1) - 1;
      const int bi = stride * (2 * lid + 2) - 1;

      const int left = temp[ai];
      temp[ai] = temp[bi];
      temp[bi] += left;
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  // Write the scanned entries back to device memory.
  histo[2 * gid] = temp[2 * lid];
  histo[2 * gid + 1] = temp[2 * lid + 1];

  barrier(CLK_GLOBAL_MEM_FENCE);
}

// Add the offset of each work group (globsum[group]) to the two histogram
// entries owned by every work item of that group.
__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
{
  const int gid = get_global_id(0);
  const int offset = globsum[get_group_id(0)];

  histo[2 * gid] += offset;
  histo[2 * gid + 1] += offset;

  barrier(CLK_GLOBAL_MEM_FENCE);
}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp new file mode 100644 index 000000000..047004d5e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +constexpr unsigned int vector_size = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, ReductionOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && + op != ReductionOperation::ARG_IDX_MIN, + "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + + if (output->total_size() != 0) + { + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, + DataType::S64); + } + if (prev_output != nullptr && prev_output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, + DataType::S32, DataType::S64); + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output); + } + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *prev_output, + ITensorInfo *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_UNUSED(op); + // Output tensor auto initialization if not yet initialized + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(axis, 1); + DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32; + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + Window win = calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), + Steps(vector_size)); + bool window_changed = false; + + switch (axis) + { + case 0: + { + ITensorInfo *input_tensor_access = prev_output != nullptr ? 
prev_output : input; + AccessWindowStatic input_access(input_tensor_access, 0, 0, + static_cast<int>(input_tensor_access->dimension(0)), 1); + AccessWindowHorizontal output_access(output, 0, 1); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + break; + case 1: + case 2: + case 3: + { + AccessWindowHorizontal input_access(input, 0, vector_size); + AccessWindowHorizontal output_access(output, 0, vector_size); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_tuple(err, win); +} +} // namespace + +CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx() + : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), + _op(ReductionOperation::ARG_IDX_MAX) +{ +} + +void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output, + ICLTensor *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, + output->info(), axis, op)); + auto win_config = validate_and_configure_window( + input->info(), (prev_output != nullptr) ? 
prev_output->info() : nullptr, output->info(), axis, + op); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + _input = input; + _prev_output = prev_output; + _output = output; + _reduction_axis = axis; + _op = op; + + // Set build options + CLBuildOptions build_opts; + + build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT"); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); + build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); + build_opts.add_option("-DDATA_TYPE_OUTPUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_SELECT=" + + get_cl_signed_type_from_element_size(input->info()->element_size())); + + // Create kernel + cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange(); + std::string kernel_axis_name; + switch (axis) + { + case 0: + { + const ICLTensor *input_for_width = prev_output != nullptr ? 
_prev_output : _input; + build_opts.add_option("-DWIDTH=" + + support::cpp11::to_string(input_for_width->info()->dimension(0))); + + kernel_axis_name = "x"; + lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), + vector_size); + } + break; + case 1: + build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); + kernel_axis_name = "y"; + break; + case 2: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + kernel_axis_name = "z"; + break; + case 3: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3))); + kernel_axis_name = "w"; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); + + // Configure kernel window + ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); +} + +Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (prev_output != nullptr) ? 
prev_output->clone().get() : nullptr, + output->clone().get(), axis, op))); + return Status{}; +} + +void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + switch (_reduction_axis) + { + case 0: + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + // Reshape window + const unsigned int num_tensors = _prev_output != nullptr ? 3 : 2; + + // Set local sums buffer + unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size(); + _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + if (_prev_output != nullptr) + { + add_2D_tensor_argument(idx, _prev_output, in_slice); + } + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } + break; + case 1: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), + _input->info()->dimension(1))); + Window in_slice = window_in.first_slice_window_2D(); + Window out_slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_2D(in_slice) && + window.slide_window_slice_2D(out_slice)); + } + break; + case 2: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(Window::DimZ, 
Window::Dimension(0, _input->info()->dimension(2), + _input->info()->dimension(2))); + Window in_slice = window_in.first_slice_window_3D(); + Window out_slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_3D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_3D(in_slice) && + window.slide_window_slice_3D(out_slice)); + } + break; + case 3: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(3, Window::Dimension(0, 1, 1)); + Window in_slice = window_in.first_slice_window_4D(); + Window out_slice = window.first_slice_window_4D(); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, in_slice); + add_4D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_4D(in_slice) && + window.slide_window_slice_4D(out_slice)); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp new file mode 100644 index 000000000..fbc76f5e1 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + } + return Status{}; +} +} // namespace + +CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor *input2, + ICLTensor *output, BinaryLogicalOperation op) +{ + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_parameters(input1->info(), input2->info(), output->info())); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Create kernel + std::string kernel_name = "binary_logical_op"; + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + 
get_cl_type_from_data_type(input1->info()->data_type()))); + + int op_code = 0; + switch (op) + { + case BinaryLogicalOperation::AND: + op_code = 1; + break; + case BinaryLogicalOperation::OR: + op_code = 2; + break; + default: + throw std::runtime_error("Operation not supported, yet"); + } + + build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + + const ValidRegion &valid_region = broadcast_pair.second; + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1->info()); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2->info()); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win_input1, input1_access) || + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); + + output_access.set_valid_region(win, valid_region); + + ICLKernel::configure_internal(win); +} + +void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + + bool 
can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLBinaryLogicalOpKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp new file mode 100644 index 000000000..6e0bcde7f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 
2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "support/StringSupport.h" + +#include <cstddef> +#include <set> +#include <string> + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(input == output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::S16, DataType::U16, DataType::U32, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), + "Input and output data types must be different"); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} +} // namespace + +void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype + // must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + // Get number of elements to process per iterations + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + + 
support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_OUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + const std::string kernel_name = "cast_bool"; + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + + // Configure kernel + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); + + // Collapse window + const Window &full_window = window(); + Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ); + ICLKernel::configure_internal(collapsed_window); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(output->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); +} + +Status CLCastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + + return Status{}; +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..67aaf2db6 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() + : _input(nullptr), _output(nullptr), _lookups(nullptr) +{ +} + +Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + return Status{}; +} + +void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *output, + const ICLTensor *lookups) +{ + 
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "embedding_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLEmbeddingLookupKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_in); + add_1D_tensor_argument(idx, _lookups, win_lookup); + + enqueue(queue, *this, slice_in); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp new file mode 100644 index 000000000..3bfe3e407 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ 
-0,0 +1,162 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/kernels/CLGatherExKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/core/UtilsEx.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ + +inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, + ITensorInfo *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); 
+ std::unique_ptr<ITensorInfo> output_info = input->clone(); + output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), actual_axis)); + // Output auto initialization if not yet initialized + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); + + // Create window + Window win = calculate_max_window(*output, Steps()); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + return std::make_pair(Status{}, win); +} + +} // namespace + +CLGatherExKernel::CLGatherExKernel() + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) +{ +} + +void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indices, + ICLTensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), indices->info(), output->info(), axis)); + + // Configure kernel window + auto win_config = + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + _input = input; + _output = output; + _indices = indices; + _axis = wrap_around(axis, static_cast<int>(input->info()->num_dimensions())); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DOUTPUT_DIM_Z=" + + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + 
ICLKernel::configure_internal(win_config.second); +} + +Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + indices->clone().get(), + output->clone().get(), axis) + .first); + return Status{}; +} + +void CLGatherExKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); + add_4D_tensor_argument(idx, _output, window_collapsed); + enqueue(queue, *this, window_collapsed, lws_hint()); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp new file mode 100644 index 000000000..930e7c944 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, input_access, output_access); + input_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLHashtableLookupKernel::CLHashtableLookupKernel() +{ + // DO NOTHING +} + +Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Output's shape was not set"); + + 
ARM_COMPUTE_ERROR_ON(lookups->dimension(0) != hits->dimension(0) || + output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + + return Status{}; +} + +void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTensor *keys, + const ICLTensor *input, ICLTensor *output, ICLTensor *hits) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Make _lookup_indices tensor + _lookup_indices = support::cpp14::make_unique<CLTensor>(); + _lookup_indices->allocator()->init( + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + _lookup_indices->allocator()->allocate(); + + // Set kernel build options + std::stringstream kernel_name; + std::set<std::string> build_opts; + kernel_name << "hashtable_lookup"; + + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +void CLHashtableLookupKernel::run(const Window &window, 
cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const_cast<ICLTensor *>(_lookups)->map(queue); + const_cast<ICLTensor *>(_keys)->map(queue); + _hits->map(queue); + _lookup_indices->map(queue); + + // Set values of hits + const int32_t *lookups_buf = + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); + uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); + int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); + + std::map<int32_t, size_t> key_map; + const size_t keys_num = _keys->info()->dimension(0); + for (size_t key_index = 0; key_index < keys_num; key_index++) + { + key_map[keys_buf[key_index]] = key_index; + } + + const size_t lookups_num = _lookups->info()->dimension(0); + for (size_t i = 0; i < lookups_num; ++i) + { + const auto lookup_value = lookups_buf[i]; + const auto it = key_map.find(lookup_value); + if (it != key_map.end()) + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= lookups_num) + ARM_COMPUTE_ERROR("HashTable Lookup: index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices_buf[i] = static_cast<int32_t>(it->second); + hits_buf[i] = static_cast<uint8_t>(1); + } + else + { + lookup_indices_buf[i] = -1; + hits_buf[i] = static_cast<uint8_t>(0); + } + } + + const_cast<ICLTensor *>(_lookups)->unmap(queue); + const_cast<ICLTensor *>(_keys)->unmap(queue); + _hits->unmap(queue); + _lookup_indices->unmap(queue); + + Window win = window.collapse(ICLKernel::window(), 2, 4); + + Window win_lookup; + win_lookup.set(Window::DimX, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, win); + add_4D_tensor_argument(idx, _output, win); + add_1D_tensor_argument(idx, _lookup_indices.get(), win_lookup); + + 
enqueue(queue, *this, win); + } while (window.slide_window_slice_4D(win) && window.slide_window_slice_1D(win_lookup)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..61c14d271 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Window.h" +#include "support/StringSupport.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_UNUSED(gamma); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + 
Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // CLInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) +{ +} + +void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor *output, + ICLTensor *gamma, ICLTensor *beta, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + _run_in_place = (output == nullptr) || (output == input); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), + gamma ? gamma->info() : nullptr, + beta ? 
beta->info() : nullptr, epsilon)); + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1))); + build_opts.add_option("-DDIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon)); + build_opts.add_option_if(gamma, "-DGAMMA"); + build_opts.add_option_if(beta, "-DBETA"); + build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); + build_opts.add_option_if(_input->info()->data_layout() == DataLayout::NHWC, "-DNHWC"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + ICLKernel::configure_internal(std::get<1>(win_config)); +} + +Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? 
input->clone().get() : output->clone().get())))); + return Status{}; +} + +void CLInstanceNormalizationLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window collapsed_window = window.collapse(window, Window::DimZ); + + // We will process the planes together + if (_input->info()->data_layout() == DataLayout::NCHW) + { + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + } + else + { + collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + collapsed_window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(3), 1)); + } + + Window vec_window; + vec_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, collapsed_window); + if (!_run_in_place) + { + add_4D_tensor_argument(idx, _output, collapsed_window); + } + if (_gamma) + { + add_1D_tensor_argument(idx, _gamma, vec_window); + } + if (_beta) + { + add_1D_tensor_argument(idx, _beta, vec_window); + } + + enqueue(queue, *this, collapsed_window, lws_hint()); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp new file mode 100644 index 000000000..6b27c9917 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/StringSupport.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + + // Checks performed when output is configured + if ((output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output) +{ + // Configure kernel window + Window win = calculate_max_window(*input, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); + + // CLMultiplyScaleFactorKernel doesn't need 
padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + + return std::make_tuple(Status{}, win); +} +} // namespace + +CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel() + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) +{ +} + +void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor, + ICLTensor *output, float multiplier) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), scale_factor->info(), output->info())); + + _input = input; + _scale_factor = scale_factor; + _output = output; + _multiplier = multiplier; + + const int vec_size_x = 16 / output->info()->element_size(); + const int output_width_x = output->info()->tensor_shape().x(); + const bool multi_access_x = (output_width_x / vec_size_x > 0); + + // Create and update the window (if needed) + Window win = calculate_max_window(*output->info()); + if (multi_access_x) + { + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), + vec_size_x)); + } + ICLKernel::configure_internal(win); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option_if( + multi_access_x, "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); +} + +Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, + const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + 
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + return Status{}; +} + +void CLMultiplyScaleFactorKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_2D(); + + // Set scale_factor window + Window win_scale = calculate_max_window(*_scale_factor->info(), Steps()); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, slice); + add_1D_tensor_argument(idx, _scale_factor, win_scale); + add_2D_tensor_argument(idx, _output, slice); + _kernel.setArg<float>(idx++, _multiplier); + enqueue(queue, *this, slice, lws_hint()); + } while (window_collapsed.slide_window_slice_2D(slice)); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp new file mode 100644 index 000000000..643c8b110 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+// Checks that input and output are single-channel S16/S32/F16/F32 tensors with identical
+// dimensions and data type.
+//
+// The RETURN variants of the check macros are used so that a violation is reported through the
+// returned Status in every build type; configure() converts a failing Status into an exception
+// via ARM_COMPUTE_ERROR_THROW_ON.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::S32,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::S32,
+                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(),
+                                                     output->tensor_shape());
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  return Status{};
+}
+
+} // namespace
+
+// Starts with no tensors bound.
+CLNegKernel::CLNegKernel() : _input(nullptr), _output(nullptr) {}
+
+// Validates and stores the tensors, builds the "neg_tensor" OpenCL kernel and configures a
+// window that processes 16 elements per iteration along X.
+void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  _input = input;
+  _output = output;
+
+  constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+  // Create kernel
+  std::set<std::string> build_opts;
+  build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+  build_opts.emplace(
+    ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts));
+
+  // Configure window
+  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+  // Both tensors are accessed in horizontal runs of num_elems_processed_per_iteration elements;
+  // the output inherits the input's valid region.
+  AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+  AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+  update_window_and_padding(win, input_access, output_access);
+  output_access.set_valid_region(win, input->info()->valid_region());
+
+  ICLKernel::configure_internal(win);
+}
+
+// Enqueues the kernel once per 3D slice of the (collapsed) window.
+void CLNegKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = collapsed.first_slice_window_3D();
+
+  do
+  {
+    // Argument order: input (3D), output (3D).
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice);
+    add_3D_tensor_argument(idx, _output, slice);
+    enqueue(queue, *this, slice, lws_hint());
+  } while (collapsed.slide_window_slice_3D(slice));
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
new file mode 100644
index 000000000..35d70d689
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "support/StringSupport.h"
+#include <string>
+namespace arm_compute
+{
+namespace
+{
+// Validates the one-hot arguments:
+//  - output has at most 4 dimensions and axis (after wrap-around) addresses one of them,
+//  - on_value holds exactly one element of a supported, known data type,
+//  - depth is positive,
+//  - if output is already initialized, its data type matches on_value and its element count
+//    matches the shape computed by compute_onehot_shape_ex().
+inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                 const ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, on_value, output);
+  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+  ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= output->num_dimensions());
+  ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8,
+                                                       DataType::U16, DataType::S16, DataType::F16,
+                                                       DataType::U32, DataType::S32, DataType::F32);
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+      indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+  }
+  return Status{};
+}
+
+// Auto-initializes the output (shape from compute_onehot_shape_ex(), data type of on_value) when
+// it is still empty, marks the whole output as valid and returns the maximum window over it.
+// The returned Status is always OK.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices,
+                                                        const ITensorInfo *on_value,
+                                                        ITensorInfo *output, int depth, int axis)
+{
+  // Each pointer is checked once (the original listed indices twice).
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+  const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions()));
+  // Output auto initialization if not yet initialized
+  TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex(
+    indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis);
+  auto_init_if_empty((*output), output_shape, 1, on_value->data_type());
+  // Create window
+  Window win = calculate_max_window(*output, Steps());
+  output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+// Starts with no tensors bound and the explicit off_value variant selected.
+CLOneHotKernel::CLOneHotKernel()
+  : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
+    _is_off_value_memset(false)
+{
+}
+// Configures the variant that takes an explicit off_value tensor ("one_hot" kernel).
+// off_value must hold exactly one element and share on_value's data type.
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                               const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+  _is_off_value_memset = false;
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, off_value, output);
+  ARM_COMPUTE_ERROR_ON_NULLPTR(off_value->info());
+  ARM_COMPUTE_ERROR_ON(off_value->info()->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+  _off_value = off_value;
+  configure_common(indices, on_value, output, depth, axis);
+}
+// Configures the variant without an off_value tensor ("one_hot_only_on_value" kernel).
+// NOTE(review): the flag name suggests the caller fills the output with the off value
+// beforehand - confirm against the function that owns this kernel.
+void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                               ICLTensor *output, int depth, int axis)
+{
+  _is_off_value_memset = true;
+  ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output);
+  configure_common(indices, on_value, output, depth, axis);
+}
+// Shared part of both configure() overloads: validates, configures the window, stores the
+// tensors and builds the OpenCL kernel selected by _is_off_value_memset.
+void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor *on_value,
+                                      ICLTensor *output, int depth, int axis)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis));
+  // Configure kernel window
+  auto win_config =
+    validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis);
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  if (_is_off_value_memset)
+  {
+    // Replace the window with one calculated from the indices info
+    win_config.second = calculate_max_window(*indices->info(), Steps());
+  }
+  _indices = indices;
+  _on_value = on_value;
+  _output = output;
+  const auto actual_axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions()));
+  // Set build options
+  CLBuildOptions build_opts;
+  // DATA_TYPE is the unsigned CL type with the same element size as on_value's data type.
+  build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(
+                                           data_size_from_type(on_value->info()->data_type())));
+  build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis));
+  build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
+  build_opts.add_option("-DOUTPUT_DIM_Z=" +
+                        support::cpp11::to_string(output->info()->dimension(2)));
+  // Create kernel
+  const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot";
+  _kernel = static_cast<cl::Kernel>(
+    CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+  ICLKernel::configure_internal(win_config.second);
+}
+// Static check for the variant with an explicit off_value. Works on clones of the tensor infos,
+// so the caller's infos are not modified.
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+                                int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(off_value);
+  ARM_COMPUTE_RETURN_ERROR_ON(off_value->tensor_shape().total_size() != 1);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+                                                            on_value->clone().get(),
+                                                            output->clone().get(), depth, axis)
+                                .first);
+  return Status{};
+}
+// Static check for the variant without off_value. Works on clones of the tensor infos.
+Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                                const ITensorInfo *output, int depth, int axis)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis));
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(),
+                                                            on_value->clone().get(),
+                                                            output->clone().get(), depth, axis)
+                                .first);
+  return Status{};
+}
+// Enqueues the kernel once over the (collapsed) window.
+void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  // Argument order: indices (3D), on_value (1D), [off_value (1D)], output (4D).
+  unsigned int idx = 0;
+  add_3D_tensor_argument(idx, _indices, window_collapsed);
+  add_1D_tensor_argument(idx, _on_value, window_collapsed);
+  if (!_is_off_value_memset)
+  {
+    // off_value is only a kernel argument for the "one_hot" variant.
+    add_1D_tensor_argument(idx, _off_value, window_collapsed);
+  }
+  add_4D_tensor_argument(idx, _output, window_collapsed);
+  enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
new file mode 100644
index 000000000..1a7a18cfa
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+// Validates the quantization arguments:
+//  - input: single-channel F32/F16 (F16 only where supported), at most 2 dimensions,
+//  - scale_factor: same data type as input, non-empty, at most 1 dimension, with one entry per
+//    input row (dimension(0) == input->dimension(1)),
+//  - output: already initialized, QASYMM8_SIGNED, same shape as input.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor,
+                          const ITensorInfo *output)
+{
+  // scale_factor is dereferenced below, so it is null-checked explicitly together with the
+  // other two tensor infos.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, scale_factor, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+  ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, scale_factor);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1));
+
+  // Output must always be initialized
+  ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+  return Status{};
+}
+
+// Computes the execution window from the input and marks the whole output as valid.
+// The returned Status is always OK.
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // Configure kernel window
+  Window win = calculate_max_window(*input, Steps());
+
+  // Elements handled per step along X: 16 divided by the input element size.
+  const int vec_size_x = 16 / input->element_size();
+  const int input_width_x = input->tensor_shape().x();
+  // Vectorised access is only used when a row holds at least one full vector.
+  const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+  if (multi_access_x)
+  {
+    // Round the X range up to a whole number of vectors and step by one vector.
+    win.set(Window::DimX,
+            Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x),
+                              vec_size_x));
+  }
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->num_dimensions());
+  output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+  return std::make_pair(Status{}, win);
+}
+} // namespace
+
+// Starts with no tensors bound.
+CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel()
+  : _input(nullptr), _scale_factor(nullptr), _output(nullptr)
+{
+}
+
+// Validates and stores the tensors, configures the window and builds the "quantization_symm8"
+// OpenCL kernel.
+void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLTensor *scale_factor,
+                                              ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+    validate_arguments(input->info(), scale_factor->info(), output->info()));
+
+  _input = input;
+  _scale_factor = scale_factor;
+  _output = output;
+
+  // Same vector-size computation as validate_and_configure_window(); the values are needed
+  // here for the build options.
+  const int vec_size_x = 16 / input->info()->element_size();
+  const int input_width_x = input->info()->tensor_shape().x();
+  const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+  ICLKernel::configure_internal(win_config.second);
+
+  // Create kernel
+  CLBuildOptions build_opts;
+  build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+  build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.add_option("-DDATA_TYPE_OUT=" +
+                        get_cl_type_from_data_type(output->info()->data_type()));
+  // LAST_ACCESSED_X is only defined for the vectorised path; it is clamped at 0.
+  build_opts.add_option_if(
+    multi_access_x, "-DLAST_ACCESSED_X=" +
+                      support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+
+  _kernel = static_cast<cl::Kernel>(
+    CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options()));
+}
+
+// Static check: argument validation followed by window configuration on clones of the tensor
+// infos, so the caller's infos are not modified.
+Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input,
+                                               const ITensorInfo *scale_factor,
+                                               const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+    validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+  return Status{};
+}
+
+// Enqueues the kernel once per 2D slice of the (collapsed) window.
+void CLQuantizationSymmetricKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  // Support only 2D
+  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = window_collapsed.first_slice_window_2D();
+
+  do
+  {
+    // The 1D scale-factor window is the current slice with its first dimension shifted out.
+    Window scale_slice = slice.shift_dimensions(1);
+
+    // Argument order: input (2D), scale factor (1D), output (2D).
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _input, slice);
+    add_1D_tensor_argument(idx, _scale_factor, scale_slice);
+    add_2D_tensor_argument(idx, _output, slice);
+    enqueue(queue, *this, slice, lws_hint());
+  } while (window_collapsed.slide_window_slice_2D(slice));
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
new file mode 100644
index 000000000..3fbebf25a
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/StringSupport.h" + +using namespace arm_compute; +namespace +{ +// NOTE This is necessary because it is not guaranteed that the axis positions of input and output +// are the same. 
+const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
+{
+  // The reduced shape is the input shape with the reduced axis set to 1.
+  TensorShape out_shape{input_shape};
+
+  out_shape.set(axis, 1);
+
+  return out_shape;
+}
+} // namespace
+
+namespace
+{
+// Validates the reduction arguments:
+//  - input: single-channel QASYMM8/F16/F32/S32 (QASYMM8 is rejected for SUM),
+//  - output: already initialized, same data type as input, and with as many elements as the
+//    input shape has once `axis` is reduced to 1,
+//  - axis: smaller than the input's number of dimensions.
+// Every violation is reported through the returned Status.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
+                          ReductionOperation op)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32, DataType::S32);
+  if (op == ReductionOperation::SUM)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8,
+                                    "Not support QASYMM8, yet");
+  }
+
+  // The output has to be initialized by the caller. NOTE: the message text is kept as-is for
+  // compatibility, although the condition is about an empty output, not about broadcasting.
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
+                                  "Inputs are not broadcast compatible");
+  // Returning variant instead of the asserting ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES, so
+  // that a mismatch surfaces as an error Status rather than aborting or being skipped.
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  const auto num_dimensions = input->tensor_shape().num_dimensions();
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than (input's rank).");
+
+  const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
+                                  "output shape's size does not match axis");
+
+  return Status{};
+}
+} // namespace
+
+// Starts with no tensors bound and a value-initialized axis.
+CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
+
+// Validates and stores the arguments, selects the OpenCL kernel for the requested operation and
+// configures the window from the inferred (axis-reduced) output shape.
+// Throws std::runtime_error for operations other than MAX, MIN, SUM and MEAN_SUM.
+void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                        const uint32_t axis, ReductionOperation op)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
+
+  _input = input;
+  _output = output;
+  _axis = axis;
+
+  // Work on a clone of the output info carrying the inferred shape, so the real output's shape
+  // is left as the caller set it.
+  std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
+  output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
+
+  // Construct kernel name
+  // MAX/MIN share "reduce_min_max", SUM/MEAN_SUM share "reduce_sum_mean"; OP_CODE tells the
+  // kernel which of the two operations to perform.
+  std::string kernel_name;
+  int op_code = 0;
+  if (op == ReductionOperation::MAX)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 1;
+  }
+  else if (op == ReductionOperation::MIN)
+  {
+    kernel_name = "reduce_min_max";
+    op_code = 2;
+  }
+  else if (op == ReductionOperation::SUM)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 3;
+  }
+  else if (op == ReductionOperation::MEAN_SUM)
+  {
+    kernel_name = "reduce_sum_mean";
+    op_code = 4;
+  }
+  else
+    throw std::runtime_error("Operation not supported, yet");
+
+  // Set kernel build options
+  std::set<std::string> build_opts;
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
+  build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
+
+  // Create kernel
+  _kernel =
+    static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output_info, Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output_info->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
+
+  ICLKernel::configure_internal(win);
+}
+
+// Static check of the arguments; never modifies the tensor infos.
+Status CLReduceOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                         const uint32_t axis, ReductionOperation op)
+{
+  // Report null pointers through the returned Status instead of asserting.
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+
+  return Status{};
+}
+
+// Enqueues the kernel once over the window collapsed to at most 4 dimensions.
+void CLReduceOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+  const TensorShape &shape_in = _input->info()->tensor_shape();
+
+  unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
+
+  // Trailing scalar arguments: the reduced axis and the input extent along that axis.
+  _kernel.setArg<cl_int>(idx++, _axis);
+  _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
+
+  // Support dimensions up to 4
+  Window slice_out = window.collapse(ICLKernel::window(), 2, 4);
+
+  // Setup input slice
+  // All four input dimensions are set to an empty (0, 0, 0) range.
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Copy output's shape in order to use for recovering at end of this method
+  // TODO Remove changing and recovering output's shape if it is guaranteed that the axis positions
+  //      of input and output are the same
+  const TensorShape shape_out = _output->info()->tensor_shape();
+  _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
+
+  // Leading tensor arguments: input (4D), output (4D).
+  idx = 0;
+  add_4D_tensor_argument(idx, _input, slice_in);
+  add_4D_tensor_argument(idx, _output, slice_out);
+  enqueue(queue, *this, slice_out, lws_hint());
+
+  // Recover output's shape of output tensor
+  _output->info()->set_tensor_shape(shape_out);
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
new file mode 100644
index 000000000..8d8853c81
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/StringSupport.h"
+
+#include <climits>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Input must be a single-channel F32 tensor with at most 2 dimensions. An output that is
+// already initialized must share the input's data type and be 1-D with input->dimension(1)
+// elements; an empty output is accepted as-is.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+
+  const bool output_is_initialized = output->tensor_shape().total_size() > 0;
+  if (!output_is_initialized)
+  {
+    // Nothing further to check; the output gets auto-initialized during window configuration.
+    return Status{};
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+  TensorShape expected_shape = TensorShape{input->dimension(1)};
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_shape);
+
+  return Status{};
+}
+
+// Auto-initializes an empty output, computes the execution window over the input and updates
+// the padding requirements. Reports "Insufficient Padding!" when update_window_and_padding()
+// signals a change.
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+  // One output element per input row
+  TensorShape rows_shape = TensorShape{input->dimension(1)};
+  auto_init_if_empty(*output, rows_shape, 1, input->data_type());
+
+  const unsigned int elems_per_iteration = 1;
+
+  // Window over the input, one element per step
+  Window win = calculate_max_window(*input, Steps(elems_per_iteration));
+  AccessWindowHorizontal input_access(input, 0, elems_per_iteration);
+  AccessWindowStatic output_access(output, 0, 0, output->dimension(0), 1);
+
+  const bool padding_changed = update_window_and_padding(win, input_access, output_access);
+
+  output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+  if (padding_changed)
+  {
+    Status padding_error =
+      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+    return std::make_tuple(padding_error, win);
+  }
+  return std::make_tuple(Status{}, win);
+}
+} // namespace
+
+// A freshly constructed kernel has no tensors attached.
+CLScaleFactorSymm8Kernel::CLScaleFactorSymm8Kernel() : _input(nullptr), _output(nullptr)
+{
+}
+
+// Checks the arguments, remembers the tensors, builds the "scale_factor_symm8" OpenCL kernel
+// (WIDTH = input->dimension(0)) and finally configures the execution window.
+void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+  _input = input;
+  _output = output;
+
+  // The only build option is the row width
+  std::set<std::string> kernel_defines;
+  kernel_defines.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+  _kernel = static_cast<cl::Kernel>(
+    CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", kernel_defines));
+
+  // Window configuration comes after kernel creation, as before
+  auto window_result = validate_and_configure_window(input->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(window_result));
+  ICLKernel::configure_internal(std::get<1>(window_result));
+}
+
+// Static check performed on clones of the infos, leaving the caller's infos untouched.
+Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+    std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+
+  return Status{};
+}
+
+// Runs the kernel slice by slice over the (collapsed) window.
+void CLScaleFactorSymm8Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window slice = window_collapsed.first_slice_window_2D();
+  // X is reduced to the single step [0, 1)
+  slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+  bool has_more_slices = true;
+  while (has_more_slices)
+  {
+    // 1-D output window: the current slice with its first dimension shifted out
+    Window output_slice = slice.shift_dimensions(1);
+
+    // Kernel arguments: input (2D) followed by output (1D)
+    unsigned int arg_idx = 0;
+    add_2D_tensor_argument(arg_idx, _input, slice);
+    add_1D_tensor_argument(arg_idx, _output, output_slice);
+    enqueue(queue, *this, slice, lws_hint());
+
+    has_more_slices = window_collapsed.slide_window_slice_2D(slice);
+  }
+}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
new file mode 100644
index 000000000..151d45e8d
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 +namespace arm_compute +{ +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} + +void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + _topk_values = topk_values; + _topk_indices = topk_indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); + + unsigned int idx = 3 * num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *indices); + _kernel.setArg(idx++, *temp_stack); + _kernel.setArg<cl_int>(idx++, k); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 1, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; 
+ add_1D_tensor_argument(idx, _input, window); + add_1D_tensor_argument(idx, _topk_values, window); + add_1D_tensor_argument(idx, _topk_indices, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} + +void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, + int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); + ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); + + unsigned int idx = num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *in_key_buf); + _kernel.setArg(idx++, *in_ind_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +// This kernel makes a histogram of radix for each work item. 
+CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} + +void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); + + int loc_histo_size = radix * _ITEMS * sizeof(cl_int); + + unsigned int idx = 1; + _kernel.setArg(idx++, *hist_buf); + + idx = 3; + _kernel.setArg(idx++, loc_histo_size, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg<cl_int>(2, _pass); + + cl::NDRange lws = cl::NDRange(_ITEMS, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} + +void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( 
+ CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} + +void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, + int bits) +{ + ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *glob_sum_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *temp_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 
_HISTOSPLIT / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} + +void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortReorder::CLRadixSortReorder() + : _pass(0), 
_in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), + _out_ind_buf(nullptr) +{ +} + +void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); + + unsigned int idx = 2; + _kernel.setArg(idx++, *hist_buf); + + idx = 6; + _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure_internal(win); +} + +void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); + cl::NDRange lws = (lx < gws_x) ? 
cl::NDRange(lx, 1) : cl::NDRange(1, 1); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg(1, *_out_key_buf); + _kernel.setArg<cl_int>(3, _pass); + _kernel.setArg(4, *_in_ind_buf); + _kernel.setArg(5, *_out_ind_buf); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} + +void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); + + unsigned int idx = 1; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_out_key_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() + : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", 
build_opts)); + + unsigned int idx = 4; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_in_key_buf); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_in_ind_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Store::CLTopKV2Store() + : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(k == 0); + ARM_COMPUTE_ERROR_ON(k > n); + + _values = values; + _indices = indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); + + unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, k, 1)); + ICLKernel::configure_internal(win); +} + +void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) +{ + _out_key_buf = out_key_buf; + _out_ind_buf = out_ind_buf; +} + +void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _values, window); 
+ add_1D_tensor_argument(idx, _indices, window); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +} // namespace arm_compute +#endif // Disable GPU implementation diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp new file mode 100644 index 000000000..dfe5d59b0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
+
+#include <algorithm>
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+namespace
+{
+
+using namespace arm_compute;
+// Generic driver for a binary element-wise operation over two tensors.
+//
+// For every position of `window` with X collapsed to a single step, the row along X is handled by
+// a caller-supplied vector routine first and then by `scalar_func` for whatever is left over.
+//
+// in1, in2       - input tensors; either one may be broadcast along X.
+// out            - output tensor.
+// window         - execution window; its X range is walked manually inside this function.
+// scalar_func    - computes one output element from one element of each input.
+// broadcast_func - vector routine for the broadcast case. It receives (start, end, step,
+//                  non-broadcast row, broadcast value, output row, reorder) and returns the X
+//                  index at which scalar processing must resume. `reorder` is true when the
+//                  broadcast value is the FIRST operand.
+// neon_func      - vector routine for the non-broadcast case, with the same return convention.
+//
+// InputVectorType is not referenced anywhere in the body.
+template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
+void elementwise_op_templ(
+  const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+  OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+  int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &,
+                        OutputScalarType *, const bool),
+  int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *,
+                   OutputScalarType *))
+{
+  // Create input windows
+  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+  // Clear X Dimension on execution window as we handle manually
+  Window win = window;
+  win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+  // Number of elements the vector routines are told to advance per iteration: 16 bytes worth of
+  // output elements, capped at 8.
+  // NOTE(review): for 1-byte element types this evaluates to 8, not 16. The value is passed
+  // through unchanged, so a vector routine that loads a full 128-bit register of 1-byte elements
+  // per iteration would overlap and run past `window_end_x` - confirm against the uint8_t callers.
+  const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
+  const auto window_start_x = static_cast<int>(window.x().start());
+  const auto window_end_x = static_cast<int>(window.x().end());
+  // A step of 0 along X marks the input window as broadcast in that dimension.
+  const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+  if (is_broadcast_across_x)
+  {
+    // Work out which input is the broadcast one; input 2 wins when both have step 0.
+    const bool is_broadcast_input_2 = input2_win.x().step() == 0;
+    Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
+    Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
+    const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
+    const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+    // Clear X Dimension on execution window as we handle manually
+    non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator broadcast_input(broadcast_tensor, broadcast_win);
+    Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+    Iterator output(out, win);
+
+    execute_window_loop(
+      win,
+      [&](const Coordinates &) {
+        auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+        const auto non_broadcast_input_ptr =
+          reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+        const InputScalarType broadcast_value =
+          *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+        // Vector part; `x` is where it stopped.
+        int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x,
+                                  non_broadcast_input_ptr, broadcast_value, output_ptr,
+                                  !is_broadcast_input_2);
+        // Scalar tail, keeping the original operand order (input 1 first, input 2 second).
+        for (; x < window_end_x; ++x)
+        {
+          const auto a = *(non_broadcast_input_ptr + x);
+          *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+                                             !is_broadcast_input_2 ? a : broadcast_value);
+        }
+      },
+      broadcast_input, non_broadcast_input, output);
+  }
+  else
+  {
+    // Clear X Dimension on execution window as we handle manually
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
+
+    execute_window_loop(
+      win,
+      [&](const Coordinates &) {
+        auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+        const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+        // Vector part; `x` is where it stopped.
+        int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr,
+                             output_ptr);
+        // Scalar tail.
+        for (; x < window_end_x; ++x)
+        {
+          const auto a = *(input1_ptr + x);
+          const auto b = *(input2_ptr + x);
+          *(output_ptr + x) = (*scalar_func)(a, b);
+        }
+      },
+      input1, input2, output);
+  }
+}
+
+} // namespace
+
+namespace arm_compute
+{
+
+// float overload: forwards to elementwise_op_templ<float, float, float32x4_t>.
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+                    float (*scalar_func)(const float &, const float &),
+                    int (*broadcast_func)(int, int, int, const float *, const float &, float *,
+                                          const bool),
+                    int (*neon_func)(int, int, int, const float *, const float *, float *))
+{
+  elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func,
+                                                  broadcast_func, neon_func);
+}
+
+// uint8_t overload: forwards to elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>.
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+                    uint8_t (*scalar_func)(const uint8_t &, const uint8_t &),
+                    int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &,
+                                          uint8_t *, const bool),
+                    int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *))
+{
+  elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func,
+                                                     broadcast_func, neon_func);
+}
+} // namespace arm_compute
diff --git
a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp new file mode 100644 index 000000000..32d7d6237 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include <algorithm> +#include <arm_neon.h> +#include <map> +#include <string> + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace arm_compute +{ + +template <BinaryLogicalOperation op, typename ScalarType> +inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case BinaryLogicalOperation::AND: + res = a & b; + break; + case BinaryLogicalOperation::OR: + res = a | b; + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op, typename VectorType> +inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b) +{ + VectorType res = {0, 0, 0, 0}; + + switch (op) + { + case BinaryLogicalOperation::AND: + res = wrapper::vand(a, b); + break; + case BinaryLogicalOperation::OR: + res = wrapper::vorr(a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <BinaryLogicalOperation op> +inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) +{ + uint8x16x4_t out = {{ + elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), + elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], 
b.val[3]), + }}; + return out; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline VectorType elementwise_logic_op_broadcast(const VectorType &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, + ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder)); + } + return x; +} + +template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> +void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, + const Window &window) +{ + elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>, + &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>, + &elementwise_logic_op_loop<op, ScalarType, 
VectorType>); +} + +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( + const ITensor *input1, const ITensor *input2, ITensor *output, + std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) +{ + std::string function_to_call("op_"); + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if (it != map_function.end()) + { + auto func = it->second; + return [func](const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window) { func(input1, input2, output, window); }; + } + return nullptr; +} + +template <BinaryLogicalOperation op> +std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> +configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { + {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + + return configure_func(input1, input2, output, map_function); +} + +void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1, + const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); + configure_common(input1, input2, output); + switch (op) + { + case BinaryLogicalOperation::AND: + _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output); + break; + case BinaryLogicalOperation::OR: + _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + 
+Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) +{ + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, + DataType::QASYMM8); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); + + const TensorShape out_shape = + TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); + return Status{}; +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp new file mode 100644 index 000000000..12017e543 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/SaturateCast.h" + +#include "arm_compute/core/NEON/wrapper/wrapper.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(input == output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::S16, DataType::U16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} +} // namespace + +NECastBoolKernel::NECastBoolKernel() : _input(nullptr), _output(nullptr) {} + +void NECastBoolKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype + // must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + _input = input; + _output = output; + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICPPKernel::configure(win); +} + +Status 
NECastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + return Status{}; +} + +void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output); + ARM_COMPUTE_ERROR_ON(_input == _output); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win); + Iterator output(_output, win); + + const uint8_t true_val = 1; + const uint8x8_t mask_bool = vdup_n_u8(true_val); + + switch (_output->info()->data_type()) + { + case DataType::S8: + { + /* Conversion U8 -> S8 */ + execute_window_loop(win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_s8(output_ptr + x, vreinterpretq_s8_u8(vandq_u8( + texels_u8, vdupq_n_u8(true_val)))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::S16: + { + /* Up-conversion U8 -> S16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 
= vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s16(output_ptr + x, texels.val[0]); + vst1q_s16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::S32: + { + /* Up-conversion U8 -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::F32: + { + /* Up-conversion U8 -> F32 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + 
x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(output_ptr + x + 12, + vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); + *(output_ptr + x) = static_cast<float>(in); + } + }, + input, output); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + /* Up-conversion U8 -> F16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::U8: + { + /* Conversion U8 -> S8 */ + execute_window_loop(win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = 
reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::U16: + { + /* Up-conversion U8 -> U16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), + vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; + + vst1q_u16(output_ptr + x, texels.val[0]); + vst1q_u16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp new file mode 100644 index 000000000..091d38c56 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() + : _input(nullptr), _lookups(nullptr), _output(nullptr) +{ +} + +void NEEmbeddingLookupKernel::configure(const ITensor *input, ITensor *output, + const ITensor *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), lookups->info())); + + _input = input; + _output = output; + _lookups = lookups; + + // Auto initialize output if not initialized + auto out_shape = input->info()->tensor_shape(); + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions()); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, + const arm_compute::ITensorInfo *output, + const arm_compute::ITensorInfo *lookups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != 
output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + return Status{}; +} + +void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const int32_t lookup = *reinterpret_cast<int32_t *>( + _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp new file mode 100644 index 000000000..93963a504 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +namespace arm_compute +{ +namespace +{ +/** Validate the indices + * + * Validate that indices are not negative + * + * @param[in] indices Indices tensor info. + */ +template <typename U> void validate_indices(const ITensor *indices) +{ + for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i) + { + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0); + } +} + +} // namespace + +NEGatherKernelEx::NEGatherKernelEx() + : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} +{ +} + +template <typename U> +inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Iterator output_it(_output, window); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + 
output_it.ptr()); + }, + output_it); +} + +template <typename U> +void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + // Validate that the indices are not negative + validate_indices<U>(_indices); + + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator output_it(_output, output_window); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank, _axis); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), + output_it.ptr()); + }, + output_it); +} + +void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); + ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + + _input = input; + _indices = indices; + _output = output; + _axis = axis; + _indices_rank = indices->info()->num_dimensions(); + + if (_axis < 0) + { + _axis += input->info()->num_dimensions(); + } + ARM_COMPUTE_ERROR_ON(0 > 
_axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions())); + + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEGatherKernelEx::gather_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEGatherKernelEx::gather_n_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Output auto initialization if not yet initialized + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); + + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); + + if (axis < 0) + { + axis += input->num_dimensions(); + } + + ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, 
DataType::F16, DataType::F32); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( + input->tensor_shape(), indices->tensor_shape(), axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + return Status{}; +} + +void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window, info); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp new file mode 100644 index 000000000..30787c0a4 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <unordered_map> + +using namespace arm_compute; + +namespace +{ +constexpr size_t NOT_HIT = 0xFFFFFFFF; +} // namespace + +NEHashtableLookupKernel::NEHashtableLookupKernel() + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} +{ +} + +void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *keys, + const ITensor *input, ITensor *output, ITensor *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_THROW_ON( + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + + _lookups = lookups; + _keys = keys; + _input = input; + _output = output; + _hits = hits; + + // Auto initialize output if not initialized + auto out_shape{input->info()->tensor_shape()}; + out_shape.set(out_shape.num_dimensions() - 1, lookups->info()->num_dimensions(), false); + auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + // Auto initialize hits if not initialized + auto_init_if_empty(*hits->info(), lookups->info()->tensor_shape(), 1, DataType::U8); + + INEKernel::configure(calculate_max_window(*output->info())); +} + +Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); + + ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(lookups->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(keys->dimension(0) != input->dimension(input->num_dimensions() - 1)); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->num_dimensions() != output->num_dimensions()); + ARM_COMPUTE_ERROR_ON(output->dimension(output->num_dimensions() - 1) != lookups->dimension(0)); + for (size_t i = 0; i < output->num_dimensions() - 1; ++i) + { + ARM_COMPUTE_ERROR_ON(input->dimension(i) != output->dimension(i)); + } + } + + // Validate in case of configured hits + if (hits->total_size() > 0) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != output->dimension(output->num_dimensions() - 1)); + ARM_COMPUTE_ERROR_ON(hits->dimension(0) != lookups->dimension(0)); + ARM_COMPUTE_ERROR_ON(hits->num_dimensions() > 1); + } + + return Status{}; +} + +void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t lookup_dim = _output->info()->num_dimensions() - 1; + const int const_0 = _output->info()->data_type() == DataType::QASYMM8 + ? 
_output->info()->quantization_info().uniform().offset + : 0; + + std::unordered_map<int32_t, size_t> key_index_map; + for (size_t n = 0; n < _keys->info()->dimension(0); ++n) + { + const int32_t key = *reinterpret_cast<int32_t *>(_keys->ptr_to_element({n})); + key_index_map[key] = n; + } + std::vector<size_t> lookup_indices; + for (size_t k = 0; k < _lookups->info()->dimension(0); ++k) + { + const int32_t key = *reinterpret_cast<int32_t *>(_lookups->ptr_to_element({k})); + const auto it = key_index_map.find(key); + if (it == key_index_map.end()) + { + lookup_indices.emplace_back(NOT_HIT); + *_hits->ptr_to_element({k}) = 0; + } + else + { +#if defined(ARM_COMPUTE_DEBUG_ENABLED) + if (it->second >= _keys->info()->dimension(0)) + ARM_COMPUTE_ERROR("HashTable Lookup: Index out of bounds."); +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + lookup_indices.emplace_back(it->second); + *_hits->ptr_to_element({k}) = 1; + } + } + + Window output_window{window}; + output_window.set(Window::DimX, + Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); + + Window out_slice = output_window.first_slice_window_4D(); + do + { + Iterator output_it(_output, out_slice); + + execute_window_loop(out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + + }, + output_it); + + } while (window.slide_window_slice_4D(out_slice)); +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp new file mode 100644 index 000000000..49adf1462 --- 
/dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +template <typename T> +void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma, ITensor *beta, + float epsilon, const Window &window) +{ + /** NEON vector tag type. */ + using ExactTagType = + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + const auto channel_idx = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto 
vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), 
ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); + + if (output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); + } + + if (gamma != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), 
DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), + "Gamma's size must be the same as size of input's channel"); + } + + if (beta != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), + "Beta's size must be the same as size of input's channel"); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // NEInstanceNormalizationLayerKernelEx doesn't need padding so update_window_and_padding() can be + // skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) +{ +} + +void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *output, + ITensor *gamma, ITensor *beta, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? 
input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + + if (_input->info()->data_type() == DataType::F32) + { + _func = &instance_normalization_nchw<float>; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else if (_input->info()->data_type() == DataType::F16) + { + _func = &instance_normalization_nchw<float16_t>; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else + { + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *gamma, + const ITensorInfo *beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + return Status{}; +} + +void NEInstanceNormalizationLayerKernelEx::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + (*_func)(_input, _output, _gamma, _beta, _epsilon, window); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp new file mode 100644 index 000000000..b92130cec --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_factor, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + // Checks performed when output is configured + if ((output->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} + +inline int32x4x4_t load_value(const int32_t *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), 
vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename T> inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) +{ + const float32x4_t vscale = vdupq_n_f32(scale); + + const float32x4x4_t ret = {{ + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + }}; + return ret; +} +} // namespace + +NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) +{ +} + +void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor *scale_factor, + ITensor *output, float multiplier) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), scale_factor->info(), output->info())); + + _input = input; + _scale_factor = scale_factor; + _output = output; + _multiplier = multiplier; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + 
coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEMultiplyScaleFactorKernel::validate(const ITensorInfo *input, + const ITensorInfo *scale_factor, + const ITensorInfo *output, float multiplier) +{ + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); + + return Status{}; +} + +template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); +} + +void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_output->info()->data_type()) + 
{ + case DataType::F32: + NEMultiplyScaleFactorKernel::multiply<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEMultiplyScaleFactorKernel::multiply<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp new file mode 100644 index 000000000..0a11eb509 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +namespace arm_compute +{ +namespace +{ +/** Validate the depth + * + * Validate that depth are not negative + * + * @param[in] depth Depth tensor. + * @param[in] output Output tensor. + * @param[in] axis Axis of depth. 
+ */ +template <typename U> void validate_depth(const ITensor *depth, const ITensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(depth->buffer())) < 0); + ARM_COMPUTE_ERROR_ON(static_cast<U>(output->info()->tensor_shape()[axis]) != + *(reinterpret_cast<U *>(depth->buffer()))); +} + +Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output); + const int actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(0 > actual_axis || + actual_axis >= static_cast<int>(output->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8, + DataType::U16, DataType::S16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value); + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output); + } + + return Status{}; +} + +template <typename U, typename Enable = void> bool isOnValue(U) { return true; } + +template <typename U, std::enable_if_t<std::is_integral<U>::value, int> = 0> +bool isOnValue(U index, U depth) +{ + return index >= 0 && index < depth; +} +} // namespace + +NEOneHotKernel::NEOneHotKernel() + : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1}, + _output{nullptr}, _func{} +{ +} + +template <typename U> +void NEOneHotKernel::onehot_0_axis(const Window &window, const 
ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + // Validate that the depth are not negative + validate_depth<U>(_depth, _output, _axis); + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator output_it(_output, output_window); + const U off_value = *reinterpret_cast<U *>(_off_value->buffer()); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + std::fill_n(output_it.ptr(), + _output->info()->dimension(0) * _output->info()->element_size(), off_value); + Coordinates indices_id(id); + indices_id.remove(0); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(0, new_index); + std::copy_n(_on_value->buffer(), _output->info()->element_size(), + _output->ptr_to_element(onehot_id)); + } + }, + output_it); +} + +template <typename U> +inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + // Validate that the indices are not negative + validate_depth<U>(_depth, _output, _axis); + Iterator output_it(_output, window); + execute_window_loop(window, + [&](const Coordinates &id) { + Coordinates indices_id(id); + indices_id.remove(_axis); + const U new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(_axis, new_index); + std::copy_n(static_cast<U>(id[_axis]) == new_index ? 
_on_value->buffer() + : _off_value->buffer(), + _output->info()->element_size(), output_it.ptr()); + } + }, + output_it); +} + +void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth, + const ITensor *on_value, const ITensor *off_value, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output); + ARM_COMPUTE_ERROR_ON(output->info()->total_size() == 0); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(indices->info(), depth->info(), on_value->info(), + off_value->info(), output->info(), axis)); + _indices = indices; + _depth = depth; + _on_value = on_value; + _off_value = off_value; + _output = output; + _axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions())); + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEOneHotKernel::onehot_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEOneHotKernel::onehot_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEOneHotKernel::onehot_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEOneHotKernel::onehot_n_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + INEKernel::configure(win); +} + +Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(indices, depth, on_value, off_value, output, axis)); + return Status{}; +} + +void NEOneHotKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + 
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + (this->*_func)(window, info); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp new file mode 100644 index 000000000..5841f1d69 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEAsymm.h" +#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/CPP/Validate.h" + +#include <arm_neon.h> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(scale_factor->dimension(0) != input->dimension(1)); + + return Status{}; +} + +inline float32x4x4_t load_value(const 
float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline const float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +inline float32x4_t round(const float32x4_t &fv) +{ + const float32x4_t point5_f32x4 = vdupq_n_f32(0.5f); + const float32x4_t zero_f32x4 = vdupq_n_f32(0.0f); + // If value < 0, mask = -1, else mask = 0 + int32x4_t mask_less_zero_ui32x4 = reinterpret_cast<int32x4_t>(vcltq_f32(fv, zero_f32x4)); + return vaddq_f32(fv, vaddq_f32(vcvtq_f32_s32(mask_less_zero_ui32x4), point5_f32x4)); +} + +inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, int32_t max_scale) +{ + const float32x4_t vinvscale = vdupq_n_f32(scale_factor_inv); + const int32x4_t vposend = vdupq_n_s32(max_scale); + const int32x4_t vnagend = vdupq_n_s32(-max_scale); + + const int32x4x4_t rf = {{ +#ifdef __aarch64__ + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, + vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#else //__aarch64__ + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, 
vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), +#endif //__aarch64__ + }}; + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + return vcombine_s8(pa, pb); +} +} // namespace + +NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) +{ +} + +void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *output, + ITensor *scale_factor) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), scale_factor->info())); + + _input = input; + _output = output; + _scale_factor = scale_factor; + + // Configure kernel window + Window win_config = calculate_max_window(*input->info(), Steps()); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win_config); +} + +Status NEQuantizationSymmetricKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *scale_factor) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, scale_factor)); + + return Status{}; +} + +template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window &window) +{ + constexpr auto window_step = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + // Support Only 2D input + Window win_collapsed = window; + Iterator 
input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + const auto dim_x = _input->info()->dimension(0); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + execute_window_loop( + win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); +} + +void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch (_input->info()->data_type()) + { + case DataType::F32: + NEQuantizationSymmetricKernel::quantize<float>(window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + NEQuantizationSymmetricKernel::quantize<float16_t>(window); + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + default: + 
ARM_COMPUTE_ERROR("Unsupported data type."); + } +} diff --git a/compute/ARMComputeEx/src/core/UtilsEx.cpp b/compute/ARMComputeEx/src/core/UtilsEx.cpp new file mode 100644 index 000000000..863316909 --- /dev/null +++ b/compute/ARMComputeEx/src/core/UtilsEx.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Error.h" + +using namespace arm_compute; + +const std::pair<unsigned int, unsigned int> +arm_compute::transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + const unsigned int padx = info.pad_left() + info.pad_right(); + const unsigned int pady = info.pad_top() + info.pad_bottom(); + + ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1); + ARM_COMPUTE_ERROR_ON(kernel_width <= padx); + ARM_COMPUTE_ERROR_ON(kernel_height <= pady); + + // Find the transpose conv out dimensions + // transpose conv out: + // tconv_out + pad = kernel + (in - 1) * stride + invalid + // tconv_out = kernel + (in - 1) * stride + invalid - pad + const int w = stride_x * (in_width - 1) + kernel_width - padx + invalid_right; + const int h = stride_y * (in_height - 1) + kernel_height - pady + invalid_bottom; + + return std::make_pair<unsigned int, unsigned int>(w, h); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp new file mode 100644 index 000000000..158fe0b0c --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/CLFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/CL/CLFunctionsEx.h" + +// NOTE This empty file aims to validate "CLFunctionsEx.h". +// DO NOT REMOVE this file. diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp new file mode 100644 index 000000000..267228eac --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/Utils.h" + +namespace arm_compute +{ +CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), + _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() +{ +} + +Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, + const ReductionOperation &op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && + op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + const unsigned int num_of_stages = + calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + + DataType output_data_type = DataType::S32; + TensorInfo not_reshaped_output; + const auto input_num_channles = input->num_channels(); + const auto input_qinfo = input->quantization_info(); + + if (output->total_size() != 0) + { + output_data_type = output->data_type(); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, + false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); + } + + auto shape_before_reshape = input->tensor_shape(); + shape_before_reshape.set(axis, 1); + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, 
DataType data_type, + int num_channels, QuantizationInfo qinfo) { + ti.set_data_type(data_type) + .set_tensor_shape(shape) + .set_num_channels(num_channels) + .set_quantization_info(qinfo); + }; + + initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, + input_num_channles, input_qinfo); + + if (num_of_stages == 1) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, nullptr, &not_reshaped_output, axis, op)); + } + else + { + // Create temporary tensor infos + std::vector<TensorInfo> sums_vector(num_of_stages - 1); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + for (unsigned int i = 0; i < num_of_stages - 1; i++) + { + shape.set(0, ceil(shape.x() / 128.f)); + sums_vector[i].set_data_type(input->data_type()); + sums_vector[i].set_tensor_shape(shape); + sums_vector[i].set_num_channels(input->num_channels()); + } + + // Validate ReductionOperation only on first kernel + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); + + // Validate ReductionOperation on intermediate stages + for (unsigned int i = 1; i < num_of_stages - 1; ++i) + { + ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], + &sums_vector[i], axis, op)); + } + + // Validate ReductionOperation on the last stage + const unsigned int last_stage = num_of_stages - 1; + ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( + input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op)); + } + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output)); + return Status{}; +} + +void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output, + const ReductionOperation &op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); + _reduction_axis = axis; + + const
TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape( + input->info()->tensor_shape(), axis, false); + DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) + ? DataType::S32 + : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + // Configure reduction operation kernels + _reduction_kernels_vector.resize(_num_of_stages); + + _memory_group.manage(&_not_reshaped_output); + // Create temporary tensors + if (_num_of_stages == 1) + { + // Force an early initialization for int64 output type + TensorShape output_shape{input->info()->tensor_shape()}; + output_shape.set(axis, 1); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + _not_reshaped_output.info()->set_tensor_shape(output_shape); + _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); + } + else + { + _results_vector.resize(_num_of_stages - 1); + TensorShape shape{input->info()->tensor_shape()}; + for (unsigned int i = 0; i < _num_of_stages - 1; i++) + { + shape.set(0, ceil(shape.x() / 128.f)); + _results_vector[i].allocator()->init( + input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); + } + + // Apply ReductionOperation only on first kernel + _memory_group.manage(&_results_vector[0]); + _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op); + + // Apply ReductionOperation on intermediate stages + for (unsigned int i = 1; i < _num_of_stages - 1; ++i) + { + _memory_group.manage(&_results_vector[i]); + _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i], + axis, op); + _results_vector[i - 1].allocator()->allocate(); + } + 
+ // Apply ReductionOperation on the last stage + const unsigned int last_stage = _num_of_stages - 1; + _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1], + &_not_reshaped_output, axis, op); + _results_vector[last_stage - 1].allocator()->allocate(); + } + _reshape_kernel.configure(&_not_reshaped_output, output); + _not_reshaped_output.allocator()->allocate(); +} + +void CLArgMinMaxLayerEx::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _num_of_stages; ++i) + { + CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + } + CLScheduler::get().enqueue(_reshape_kernel, false); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp new file mode 100644 index 000000000..e5122ab8f --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h" + +#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +using namespace arm_compute; + +void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + BinaryLogicalOperation op) +{ + auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + k->configure(input1, input2, output, op); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? 
input1 : input2; + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp new file mode 100644 index 000000000..c7d0ac8e2 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/CL/functions/CLCastBool.h" + +#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h" + +using namespace arm_compute; + +void CLCastBool::configure(ICLTensor *input, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp new file mode 100644 index 000000000..3dede0562 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <memory> +#include <tuple> + +namespace arm_compute +{ +using namespace arm_compute::misc::shape_calculator; + +CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( + std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _scale_f(), + _conv_f(), + _flip_weights(), + _scaled_output(), + _original_weights(nullptr), + _weights_flipped(), + _flip_axis(), + _is_prepared(false) +{ +} + +Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); + + auto out_dims = transposeconv_output_dimensions( + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, 
invalid_bottom);
+
+  const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+  if (bias != nullptr)
+  {
+    if (is_data_type_quantized_asymmetric(input->data_type()))
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+    }
+    else
+    {
+      ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+                                  "Output's width is invalid.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+                                  "Output's height is invalid.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+                                  "Output's depth is invalid.");
+
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+      pad_bottom);
+  TensorInfo scale_out_info(input->clone()
+                                ->set_is_resizable(true)
+                                .reset_padding()
+                                .set_tensor_shape(scale_out_shape)
+                                .set_data_layout(data_layout));
+  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+  ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
+  ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+                                                           conv_info, weights_info));
+
+  return Status{};
+}
+
+// Convenience overload: forwards every argument to the compile-context overload below, using the
+// compile context returned by CLKernelLibrary::get().
+void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights,
+                                           const ICLTensor *bias, ICLTensor *output,
+                                           const PadStrideInfo &info, unsigned int invalid_right,
+                                           unsigned int invalid_bottom,
+                                           const WeightsInfo &weights_info)
+{
+  configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info,
+            invalid_right, invalid_bottom, weights_info);
+}
+
+// Configures the transpose convolution as three chained steps: a weights flip, an upsample of the
+// input into the intermediate tensor _scaled_output, and a stride-1 convolution of that
+// intermediate tensor with the flipped weights.
+//
+// input, weights and output must not be null; bias may be null. output is auto-initialised from
+// the computed output shape when it is still empty. The configuration is checked with validate()
+// and an error is raised if that check fails.
+void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context,
+                                           ICLTensor *input, ICLTensor *weights,
+                                           const ICLTensor *bias, ICLTensor *output,
+                                           const PadStrideInfo &info, unsigned int invalid_right,
+                                           unsigned int invalid_bottom,
+                                           const WeightsInfo &weights_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  unsigned int pad_left = 0;
+  unsigned int pad_right = 0;
+  unsigned int pad_top = 0;
+  unsigned int pad_bottom = 0;
+  const unsigned int stride_x = info.stride().first;
+  const unsigned int stride_y = info.stride().second;
+
+  const DataLayout data_layout = input->info()->data_layout();
+
+  const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+  const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+  // Keep the caller's weights so prepare() can flip them once and mark them as unused
+  _original_weights = weights;
+  _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+  _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+  _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+
+  auto out_dims = transposeconv_output_dimensions(
+      input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+      weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+      invalid_bottom);
+
+  const TensorShape output_shape =
+      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+  // Output auto initialization if not yet initialized
+  auto_init_if_empty(
+      *output->info(),
+      input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+  // Perform validation step
+  ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
+      input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+      info, invalid_right, invalid_bottom));
+
+  _is_prepared = weights_info.retain_internal_weights();
+
+  _memory_group.manage(&_scaled_output);
+
+  // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+  // to match output shape
+  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+      *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+      pad_right, pad_top, pad_bottom);
+
+  TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+                            input->info()->quantization_info());
+  scale_out_info.set_data_layout(data_layout);
+  _scaled_output.allocator()->init(scale_out_info);
+
+  // configure scale function
+  const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+                                    DimensionRoundingType::FLOOR);
+  // Forward the caller's compile context here as well, like the flip and convolution steps do;
+  // otherwise a caller-supplied context would be ignored for the upsample step only.
+  _scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
+
+  // Setup the function to convolve the upscaled output
+  const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+  _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info,
+                    weights_info);
+  // The intermediate tensor is allocated only after every function using it has been configured
+  _scaled_output.allocator()->allocate();
+
+  // Setup flip axis data
+  _flip_axis.allocator()->allocate();
+  _flip_axis.map(true);
+  auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+  if (weights->info()->data_layout() == DataLayout::NHWC)
+  {
+    axis_data[0] = 1;
+    axis_data[1] = 2;
+  }
+  else
+  {
+    axis_data[0] = 0;
+    axis_data[1] = 1;
+  }
+  _flip_axis.unmap();
+}
+
+// Runs the layer: makes sure prepare() has been executed, then runs the upsample function
+// followed by the convolution while the memory group is acquired.
+void CLDirectTransposeConvLayer::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  _scale_f.run();
+  _conv_f.run();
+}
+
+// One-off preparation: flips the weights, marks the original weights as unused, prepares the
+// convolution and frees the flipped weights if nothing uses them any more. Does nothing once
+// _is_prepared is set.
+void CLDirectTransposeConvLayer::prepare()
+{
+  if (!_is_prepared)
+  {
+    ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+    // Run weights flipping and mark original weights tensor as unused
+    _weights_flipped.allocator()->allocate();
+    _flip_weights.run();
+    _original_weights->mark_as_unused();
+
+    // Prepare convolution
+    _conv_f.prepare();
+
+    // Free flipped weights
+    if (!_weights_flipped.is_used())
+    {
+      _weights_flipped.allocator()->free();
+    }
+
+    _is_prepared = true;
+  }
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
new file mode 100644
index 000000000..ae9d8afc6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
+
+using namespace arm_compute;
+
+// Creates a CLEmbeddingLookupKernel, configures it with the given tensors and hands ownership
+// of it to this function object through _kernel.
+void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
+                                  const ICLTensor *lookups)
+{
+  auto lookup_kernel = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+  lookup_kernel->configure(input, output, lookups);
+  _kernel = std::move(lookup_kernel);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
new file mode 100644
index 000000000..01989461e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Checks that the low-precision GEMM core accepts the given operands; no bias is passed to it.
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
+{
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
+
+  return Status{};
+}
+} // namespace
+
+// Sets up a transpose kernel from input to output and stores it as this function's kernel.
+void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
+{
+  auto transpose_kernel = support::cpp14::make_unique<CLTransposeKernel>();
+  transpose_kernel->configure(input, output);
+  _kernel = std::move(transpose_kernel);
+}
+
+// Validation of the weights reshape is exactly the validation of the transpose kernel.
+Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input,
+                                                           const ITensorInfo *output)
+{
+  return CLTransposeKernel::validate(input, output);
+}
+
+// The layer starts out unprepared, with weights considered reshaped and no bias accumulation;
+// configure() overwrites these flags.
+CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer(
+    std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(),
+      _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(),
+      _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(),
+      _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false),
+      _original_weights(nullptr)
+{
+}
+
+// Configures the low-precision matrix multiply without a bias tensor.
+// retain_internal_weights is accepted but not used here.
+void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights,
+                                               ICLTensor *output, bool retain_internal_weights)
+{
+  ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+  ARM_COMPUTE_UNUSED(retain_internal_weights);
+
+  _mm_gemmlowp.configure(input, weights, nullptr, output);
+}
+
+// Configures the whole hybrid pipeline: optional weights transpose, per-input scale factor
+// extraction, input quantization, low-precision GEMM, rescaling of the GEMM result into output
+// and, when biases are given, bias accumulation.
+//
+// input, weights and output must not be null. An error is raised when validate() rejects the
+// configuration.
+void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTensor *weights,
+                                            const ICLTensor *biases, ICLTensor *output,
+                                            FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+  // Reject unsupported configurations before touching any member
+  const ITensorInfo *biases_info = (biases == nullptr) ? nullptr : biases->info();
+  ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate(
+      input->info(), weights->info(), biases_info, output->info(), fc_info));
+
+  // Weights count as reshaped unless a transpose was requested and has not been done yet
+  _are_weights_reshaped = !fc_info.transpose_weights || fc_info.are_weights_reshaped;
+  _accumulate_biases = false;
+  _is_prepared = fc_info.retain_internal_weights;
+  _original_weights = weights;
+
+  // Biases are added by a separate kernel that works directly on the output tensor
+  if (biases != nullptr)
+  {
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+    _accumulate_biases = true;
+
+    _accumulate_biases_kernel.set_target(CLScheduler::get().target());
+    _accumulate_biases_kernel.configure(output, biases);
+  }
+
+  // A fully connected layer can follow either a convolution or another fully connected layer,
+  // each with or without batches. Only the "after fully connected" cases are supported here.
+  const TensorShape &input_shape = input->info()->tensor_shape();
+  const TensorShape &output_shape = output->info()->tensor_shape();
+  const bool has_batches = output->info()->dimension(1) > 1;
+  const bool is_fc_after_conv =
+      has_batches
+          ? ((TensorShape::num_max_dimensions >= 4) &&
+             (std::equal(input_shape.cbegin() + 3, input_shape.cend(), output_shape.cbegin() + 1)))
+          : (input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1);
+  ARM_COMPUTE_ERROR_ON_MSG(is_fc_after_conv,
+                           "CLFullyConnectedHybridLayer does not support after conv");
+  ARM_COMPUTE_UNUSED(is_fc_after_conv);
+
+  // Transpose the weights into an internal tensor when they are not reshaped yet
+  const ICLTensor *gemm_weights = weights;
+  if (!_are_weights_reshaped)
+  {
+    _reshape_weights_output.allocator()->init(
+        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+            compute_transposed_shape(*weights->info())));
+    _reshape_weights_kernel.configure(gemm_weights, &_reshape_weights_output);
+    gemm_weights = &_reshape_weights_output;
+  }
+
+  // Scale factor: one value per entry of output dimension 1, in the input data type
+  _scale_factor.allocator()->init(
+      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type()));
+  _memory_group.manage(&_scale_factor);
+  _scale_factor_kernel.configure(input, &_scale_factor);
+
+  // Quantized copy of the input
+  _quantized_input.allocator()->init(
+      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+          DataType::QASYMM8_SIGNED));
+  _memory_group.manage(&_quantized_input);
+  _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
+
+  // Low-precision GEMM writing S32 results into an intermediate tensor
+  _gemmlowp_output.allocator()->init(
+      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+  _memory_group.manage(&_gemmlowp_output);
+  configure_mm(&_quantized_input, gemm_weights, &_gemmlowp_output,
+               fc_info.retain_internal_weights);
+  _quantized_input.allocator()->allocate();
+
+  // Rescale the GEMM result into the output using the scale factors and the weights scale
+  _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output,
+                                   weights->info()->quantization_info().uniform().scale);
+  _gemmlowp_output.allocator()->allocate();
+  _scale_factor.allocator()->allocate();
+
+  _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+}
+
+// Static validation mirroring configure(): checks data types and ranks, then validates each
+// stage (bias accumulation, weights transpose, scale factor, quantization, GEMM, rescale) on
+// tensor infos built the same way configure() builds its intermediate tensors.
+// Returns an error status on the first failing check, otherwise an empty Status.
+Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                             const ITensorInfo *biases, const ITensorInfo *output,
+                                             FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+  const bool weights_reshaped = !fc_info.transpose_weights || fc_info.are_weights_reshaped;
+  const GPUTarget gpu_target = CLScheduler::get().target();
+
+  // Info of the weights as they would look after the transpose
+  const TensorInfo transposed_weights(
+      weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+          compute_transposed_shape(*weights)));
+
+  // Bias accumulation kernel
+  if (biases != nullptr)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
+  }
+
+  // A fully connected layer can follow either a convolution or another fully connected layer,
+  // each with or without batches. Only the "after fully connected" cases are supported here.
+  const bool has_batches = output->dimension(1) > 1;
+  const bool is_fc_after_conv =
+      has_batches ? ((TensorShape::num_max_dimensions >= 4) &&
+                     (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                 output->tensor_shape().cbegin() + 1)))
+                  : (input->num_dimensions() > 1 && input->dimension(1) > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_fc_after_conv,
+                                  "CLFullyConnectedHybridLayer does not support after conv");
+
+  // Weights transpose, only when it would actually be performed
+  const ITensorInfo *gemm_weights = weights;
+  if (!weights_reshaped)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFullyConnectedHybridLayerReshapeWeights::validate(gemm_weights, &transposed_weights));
+    gemm_weights = &transposed_weights;
+  }
+
+  // Scale factor kernel
+  const TensorInfo scale_factor_info(TensorShape{output->dimension(1)}, 1, input->data_type());
+  ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor_info));
+
+  // Symmetric quantization kernel
+  const TensorInfo quantized_input_info(
+      input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+          DataType::QASYMM8_SIGNED));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLQuantizationSymmetricKernel::validate(input, &scale_factor_info, &quantized_input_info));
+
+  // The input width has to match dimension 1 of the weights handed to the GEMM
+  ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != gemm_weights->dimension(1));
+
+  // Matrix multiply
+  const TensorInfo gemmlowp_output_info(
+      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      validate_mm(quantized_input_info, *gemm_weights, gemmlowp_output_info));
+
+  // Rescale of the GEMM result
+  ARM_COMPUTE_RETURN_ON_ERROR(
+      CLMultiplyScaleFactorKernel::validate(&gemmlowp_output_info, &scale_factor_info, output));
+
+  return Status{};
+}
+
+// Executes the pipeline in order: scale factor extraction, input quantization, low-precision
+// GEMM, rescale and, if enabled, bias accumulation. prepare() is invoked first.
+void CLFullyConnectedHybridLayer::run()
+{
+  prepare();
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  CLScheduler &scheduler = CLScheduler::get();
+
+  scheduler.enqueue(_scale_factor_kernel);
+  scheduler.enqueue(_quant_input_kernel);
+
+  _mm_gemmlowp.run();
+
+  scheduler.enqueue(_multiply_scale_kernel);
+
+  if (_accumulate_biases)
+  {
+    scheduler.enqueue(_accumulate_biases_kernel);
+  }
+}
+
+// One-off preparation: transposes the weights if still required, prepares the GEMM and frees
+// the transposed weights when they are no longer used. Returns immediately once prepared.
+void CLFullyConnectedHybridLayer::prepare()
+{
+  if (_is_prepared)
+  {
+    return;
+  }
+
+  ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+  // Transpose of the weights, done at most once
+  if (!_are_weights_reshaped)
+  {
+    _reshape_weights_output.allocator()->allocate();
+    _reshape_weights_kernel.run();
+
+    _are_weights_reshaped = true;
+    // _original_weights is left untouched because other nodes may still use it
+  }
+
+  _mm_gemmlowp.prepare();
+
+  // Drop the transposed weights once nothing uses them; wait for the queue to finish first
+  if (!_reshape_weights_output.is_used())
+  {
+    CLScheduler::get().queue().finish();
+    _reshape_weights_output.allocator()->free();
+  }
+
+  _is_prepared = true;
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
new file mode 100644
index 000000000..2ff4b9659
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/Cast.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/MemorySupport.h" + +#include <algorithm> + +namespace arm_compute +{ +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::utils::cast; + +namespace +{ +Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, + const ITensorInfo &output, + GEMMLowpOutputStageInfo &gemmlowp_output_stage) +{ + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.gemmlowp_multiplier = 0; + gemmlowp_output_stage.gemmlowp_shift = 0; + + // Configure output stage for quantized case + if (is_data_type_quantized_asymmetric(input.data_type())) + { + const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output.quantization_info().uniform(); + + const auto output_quant_info = (output.total_size() == 0) ? 
iq_info : oq_info; + + const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( + multiplier, &output_multiplier, &output_shift)); + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage.gemmlowp_shift = output_shift; + gemmlowp_output_stage.gemmlowp_min_bound = 0; + gemmlowp_output_stage.gemmlowp_max_bound = 255; + gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); + gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); + } + + return Status{}; +} + +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, + const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + true, // broadcast_bias + ActivationLayerInfo()); // activation_info + + if (is_data_type_quantized_asymmetric(input.data_type())) + { + const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo 
input_quantization_info(iq_info.scale, -iq_info.offset); + const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); + } + + return Status{}; +} +} // namespace + +void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) +{ + auto k = support::cpp14::make_unique<CLTransposeKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, + const ITensorInfo *output) +{ + return CLTransposeKernel::validate(input, output); +} + +CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, + IWeightsManager *weights_manager) + : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), + _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), + _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), + _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), + _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) +{ +} +void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), + gemmlowp_output_stage); + + const GEMMInfo 
&gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + true, // broadcast_bias + ActivationLayerInfo()); // activation_info + + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info(QuantizationInfo( + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->info()->set_quantization_info(QuantizationInfo( + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); + + // Revert back QuantizatioInfo as input and weights could be used in other fully connected + // layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, bias, output, 1.f, 1.f, gemm_info); + } +} + +void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) +{ + ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // 
linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init(input->info() + ->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(shape_flatten) + .set_data_layout(DataLayout::NCHW)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_layer.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, bias, output, fc_info); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void CLFullyConnectedLayerEx::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *bias, ICLTensor *output, + const FullyConnectedLayerInfo &fc_info) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, bias, output, fc_info); +} + +void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor *weights, + const ICLTensor *biases, ICLTensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _is_prepared = fc_info.retain_internal_weights; + _original_weights = weights; + + if (_weights_manager) + { + _weights_manager->manage(weights); + } + + const ICLTensor *weights_to_use = weights; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + if (_weights_manager && _weights_manager->are_weights_managed(weights)) + { + _reshape_weights_managed_function.configure(weights); + weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( + _weights_manager->acquire(weights, &_reshape_weights_managed_function)); + } + else + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + if (_weights_manager && _weights_manager->are_weights_managed(weights_to_use)) + { + _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(), + fc_info.weights_trained_layout); + weights_to_use = 
utils::cast::polymorphic_downcast<ICLTensor *>(
+          _weights_manager->acquire(weights, &_convert_weights_managed));
+    }
+    else
+    {
+      // Convert weights
+      _convert_weights.configure(weights_to_use, &_converted_weights_output,
+                                 input->info()->tensor_shape(), fc_info.weights_trained_layout);
+
+      weights_to_use = &_converted_weights_output;
+    }
+    _are_weights_converted = false;
+  }
+
+  if (_is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    configure_conv_fc(input, weights_to_use, biases, output, fc_info);
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    configure_fc_fc(input, weights_to_use, biases, output, fc_info);
+  }
+}
+
+/**
+ * Static check of a fully connected configuration without touching any OpenCL resource.
+ *
+ * Mirrors the decisions taken by configure(): optional weight reshape, optional weight layout
+ * conversion, optional input flattening, and finally the matrix multiplication itself.
+ *
+ * @param input   Source tensor info (QASYMM8, F16 or F32, single channel)
+ * @param weights Weights tensor info (at most 2 dimensions, same data type as input)
+ * @param biases  Bias tensor info, forwarded unchanged to validate_mm()
+ * @param output  Destination tensor info (same data type as input)
+ * @param fc_info Fully connected meta-data (transpose/reshape flags, trained weights layout)
+ *
+ * @return An empty Status on success, otherwise the first error that was detected
+ */
+Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
+                                         const ITensorInfo *biases, const ITensorInfo *output,
+                                         FullyConnectedLayerInfo fc_info)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+  // When no transpose is requested the weights are treated as already reshaped
+  bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+  bool is_fc_after_conv = true;
+
+  // Tensor infos describing the intermediate results of each optional stage
+  const ITensorInfo &flatten_input = TensorInfo(input->clone()
+                                                  ->set_is_resizable(true)
+                                                  .reset_padding()
+                                                  .set_tensor_shape(compute_flatten_shape(input))
+                                                  .set_data_layout(DataLayout::NCHW));
+  const ITensorInfo &reshaped_weights =
+    TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+      compute_transposed_shape(*weights)));
+  const ITensorInfo &converted_weights =
+    weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                     : TensorInfo(*reshaped_weights.clone());
+
+  // With the Fully Connected layer we can have 4 different cases:
+  //  1) Convolution layer -> Fully Connected layer without batches
+  //  2) Fully Connected layer -> Fully Connected layer without batches
+  //  3) Convolution layer -> Fully Connected layer with batches
+  //  4) Fully Connected layer -> Fully Connected layer with batches
+
+  const ITensorInfo *input_to_use = input;
+  const ITensorInfo *weights_to_use = weights;
+
+  // Check if we have a fully connected layer with batches
+  const bool is_batched_fc_layer = output->dimension(1) > 1;
+  if (is_batched_fc_layer)
+  {
+    is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                       (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(),
+                                   output->tensor_shape().cbegin() + 1));
+  }
+  else
+  {
+    is_fc_after_conv = input->num_dimensions() > 1;
+  }
+
+  if (!weights_reshaped)
+  {
+    // Validate reshape weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights));
+    weights_to_use = &reshaped_weights;
+  }
+
+  if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
+  {
+    // Validate convert weights kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate(
+      weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+    weights_to_use = &converted_weights;
+  }
+
+  if (is_fc_after_conv)
+  {
+    // Fully Connected layer after a Convolution Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(
+      (weights_to_use->dimension(1) !=
+       (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+    // Validate flatten kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
+    input_to_use = &flatten_input;
+  }
+  else
+  {
+    // Fully Connected layer after a Fully Connected Layer without batches
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+  }
+
+  // Validate matrix multiply kernel
+  ARM_COMPUTE_RETURN_ON_ERROR(
+    validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info));
+
+  return Status{};
+}
+
+/**
+ * Execute the layer.
+ *
+ * Unlike the disabled reference code in prepare(), the weight reshape / conversion and the GEMM
+ * preparation are issued on every call; only the allocation of the intermediate weight buffers
+ * is guarded by _is_prepared and therefore happens once.
+ */
+void CLFullyConnectedLayerEx::run()
+{
+  // One-time allocation of the buffers that receive the reshaped / converted weights
+  if (!_is_prepared)
+  {
+    if (!_are_weights_reshaped)
+      _reshape_weights_output.allocator()->allocate();
+    if (!_are_weights_converted)
+      _converted_weights_output.allocator()->allocate();
+    _is_prepared = true;
+  }
+
+  {
+    if (!_weights_manager)
+    {
+      ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+    }
+
+    // Pointer to current weights
+    const ICLTensor *cur_weights = _original_weights;
+    // Reshape of the weights
+    if (!_are_weights_reshaped)
+    {
+      if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+      {
+        // Track the reshaped tensor in the local pointer only. Writing it into _original_weights
+        // would replace the caller's weights for every later run() and would leave cur_weights
+        // pointing at the un-reshaped tensor for the conversion step below.
+        cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(
+          _weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+      }
+      else
+      {
+        _reshape_weights_function.run();
+        cur_weights = &_reshape_weights_output;
+      }
+    }
+
+    // Convert weights if needed
+    if (!_are_weights_converted)
+    {
+      if (_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+      {
+        _weights_manager->run(cur_weights, &_convert_weights_managed);
+      }
+      else
+      {
+        _convert_weights.run();
+      }
+    }
+
+    // Prepare GEMM prepare
+    if (!_is_quantized)
+    {
+      _mm_gemm.prepare();
+    }
+  }
+
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  // Linearize input if it comes from a convolutional layer
+  if (_is_fc_after_conv)
+  {
+    _flatten_layer.run();
+  }
+
+  // Run matrix multiply
+  if (_is_quantized)
+  {
+    _mm_gemmlowp.run();
+  }
+  else
+  {
+    _mm_gemm.run();
+  }
+}
+
+/**
+ * Currently a no-op: the whole body is compiled out, and the equivalent weight handling is
+ * performed inside run(). The disabled code is kept as a reference of the one-shot variant.
+ */
+void CLFullyConnectedLayerEx::prepare()
+{
+#if 0 // TODO Remove this block
+    if(!_is_prepared)
+    {
+        if(!_weights_manager)
+        {
+            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        }
+
+        auto release_unused = [](CLTensor * w)
+        {
+            if(!w->is_used())
+            {
+                CLScheduler::get().queue().finish();
+                w->allocator()->free();
+            }
+        };
+
+        // Pointer to current weights
+        const ICLTensor *cur_weights = _original_weights;
+
+        // Reshape of the weights if needed (happens only once)
+        if(!_are_weights_reshaped)
+        {
+            if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+            {
+                cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function));
+            }
+            else
+            {
+                // Run reshape weights kernel and mark weights as unused
+                _reshape_weights_output.allocator()->allocate();
+                _reshape_weights_function.run();
+
+                cur_weights->mark_as_unused();
+                cur_weights = &_reshape_weights_output;
+            }
+            _are_weights_reshaped = true;
+        }
+
+        // Convert weights if needed (happens only once)
+        if(!_are_weights_converted)
+        {
+            if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
+            {
+                _weights_manager->run(cur_weights, &_convert_weights_managed);
+            }
+            else
+            {
+                _converted_weights_output.allocator()->allocate();
+                _convert_weights.run();
+                cur_weights->mark_as_unused();
+            }
+
+            _are_weights_converted = true;
+        }
+
+        // Release reshaped weights if unused
+        release_unused(&_reshape_weights_output);
+
+        // Prepare GEMM prepare and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+        }
+
+        // Release converted weights if unused
+        release_unused(&_reshape_weights_output);
+        release_unused(&_converted_weights_output);
+
+        _is_prepared = true;
+    }
+#endif
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
new file mode 100644
index 000000000..157b4d977
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h"
+
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h>
+
+using namespace arm_compute;
+
+/**
+ * Configure an optional reshape of the input followed by a fully connected function.
+ *
+ * @param input         Source tensor
+ * @param weights       Weights tensor
+ * @param biases        Bias tensor, forwarded unchanged to the selected function
+ * @param output        Destination tensor
+ * @param needs_reshape When true, the input is first reshaped to @p reshape
+ * @param reshape       Target shape of the intermediate tensor (only used with needs_reshape)
+ * @param kernel_type   GENERAL selects CLFullyConnectedLayerEx; PREPROCESSED_WEIGHTS selects the
+ *                      hybrid layer for float input with S8/QASYMM8_SIGNED weights and the stock
+ *                      CLFullyConnectedLayer otherwise
+ *
+ * Throws std::runtime_error for any other kernel type.
+ */
+void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *input,
+                                               const arm_compute::ICLTensor *weights,
+                                               const arm_compute::ICLTensor *biases,
+                                               arm_compute::ICLTensor *output, bool needs_reshape,
+                                               const arm_compute::TensorShape &reshape,
+                                               KernelType kernel_type)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  const ICLTensor *input_to_use = input;
+  if (_needs_reshape)
+  {
+    // reshape
+    auto_init_if_empty(*_cl_buffer.info(),
+                       _input->info()->clone()->set_tensor_shape(reshape).set_data_layout(
+                         _input->info()->data_layout()));
+    _cl_reshape.configure(_input, &_cl_buffer);
+    input_to_use = &_cl_buffer;
+  }
+
+  // NOTE In every branch below the new function object is handed to a unique_ptr *before*
+  //      configure() is invoked, so that an exception raised by configure() cannot leak it.
+  _cl_fc = [&]() {
+    if (kernel_type == KernelType::GENERAL)
+    {
+      auto fc = new arm_compute::CLFullyConnectedLayerEx{_memory_manager};
+      std::unique_ptr<arm_compute::IFunction> owner{fc};
+      fc->configure(input_to_use, _weights, _biases, _output);
+      return owner;
+    }
+    else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS)
+    {
+      bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
+                        input->info()->data_type() == DataType::F16) &&
+                       (weights->info()->data_type() == DataType::S8 ||
+                        weights->info()->data_type() == DataType::QASYMM8_SIGNED);
+
+      if (is_hybrid)
+      {
+        // Puts the original weights data type back when the scope is left, including the case
+        // where configure() throws.
+        struct DataTypeRestorer
+        {
+          ITensorInfo *info;
+          DataType data_type;
+          ~DataTypeRestorer() { info->set_data_type(data_type); }
+        };
+
+        auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
+        std::unique_ptr<arm_compute::IFunction> owner{fc};
+        ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+        {
+          // The hybrid layer is configured with the weights presented as QASYMM8_SIGNED
+          DataTypeRestorer restore_on_exit{weights_info, weights_info->data_type()};
+          weights_info->set_data_type(DataType::QASYMM8_SIGNED);
+          fc->configure(input_to_use, _weights, _biases, _output);
+        }
+        return owner;
+      }
+      else
+      {
+        auto fc = new arm_compute::CLFullyConnectedLayer{_memory_manager};
+        std::unique_ptr<arm_compute::IFunction> owner{fc};
+        fc->configure(input_to_use, _weights, _biases, _output);
+        return owner;
+      }
+    }
+    else
+    {
+      throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
+    }
+
+  }();
+
+  if (_needs_reshape)
+  {
+    // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+    _cl_buffer.allocator()->allocate();
+  }
+}
+
+/** Run the optional input reshape and then the configured fully connected function. */
+void CLFullyConnectedReshapingLayer::run(void)
+{
+  if (_needs_reshape)
+    _cl_reshape.run();
+
+  _cl_fc->run();
+}
+
+/** Forward prepare() to the configured fully connected function. */
+void CLFullyConnectedReshapingLayer::prepare(void) { _cl_fc->prepare(); }
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
new file mode 100644
index 000000000..e0b833b04
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
+
+using namespace arm_compute;
+
+/**
+ * Create a CLGatherExKernel, configure it with the given tensors and axis, and install it as the
+ * kernel of this function.
+ */
+void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
+                           int axis)
+{
+  auto gather_kernel = support::cpp14::make_unique<CLGatherExKernel>();
+  gather_kernel->configure(input, indices, output, axis);
+  _kernel = std::move(gather_kernel);
+}
+
+/** Static validation; the result of CLGatherExKernel::validate is returned unchanged. */
+Status CLGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+                            const ITensorInfo *output, int axis)
+{
+  const Status kernel_status = CLGatherExKernel::validate(input, indices, output, axis);
+  return kernel_status;
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
new file mode 100644
index 000000000..65b89a389
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/runtime/CL/functions/CLHashtableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLHashtableLookupKernel.h"
+
+using namespace arm_compute;
+
+/**
+ * Create a CLHashtableLookupKernel, configure it with the given tensors, and install it as the
+ * kernel of this function.
+ */
+void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
+                                  const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
+{
+  auto lookup_kernel = support::cpp14::make_unique<CLHashtableLookupKernel>();
+  lookup_kernel->configure(lookups, keys, input, output, hits);
+  _kernel = std::move(lookup_kernel);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
new file mode 100644
index 000000000..5a7e40839
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h"
+
+#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Default constructor; performs no work of its own. */
+CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
+
+/**
+ * Create a CLInstanceNormalizationLayerKernelEx, configure it with the given tensors and epsilon,
+ * and install it as the kernel of this function.
+ */
+void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
+                                               ICLTensor *gamma, ICLTensor *beta, float epsilon)
+{
+  auto norm_kernel = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+  norm_kernel->configure(input, output, gamma, beta, epsilon);
+  _kernel = std::move(norm_kernel);
+}
+
+/**
+ * Static validation; the result of CLInstanceNormalizationLayerKernelEx::validate is returned
+ * unchanged.
+ */
+Status CLInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                const ITensorInfo *gamma, const ITensorInfo *beta,
+                                                float epsilon)
+{
+  const Status kernel_status =
+    CLInstanceNormalizationLayerKernelEx::validate(input, output, gamma, beta, epsilon);
+  return kernel_status;
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
new file mode 100644
index 000000000..28e5bc0da
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2016-2018 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLNeg.h"
+
+#include "arm_compute/core/CL/kernels/CLNegKernel.h"
+
+using namespace arm_compute;
+
+/**
+ * Create a CLNegKernel, configure it with the given input and output, and install it as the
+ * kernel of this function.
+ */
+void CLNeg::configure(ICLTensor *input, ICLTensor *output)
+{
+  auto neg_kernel = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+  neg_kernel->configure(input, output);
+  _kernel = std::move(neg_kernel);
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
new file mode 100644
index 000000000..aa9f32ec6
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/runtime/CL/functions/CLOneHot.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/MemorySupport.h"
+namespace arm_compute
+{
+/** Default constructor; the memset stage starts out disabled. */
+CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {}
+
+/** Configure with the off value supplied as a tensor; only the one-hot kernel is set up. */
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value,
+                         const ICLTensor *off_value, ICLTensor *output, int depth, int axis)
+{
+  _onehot_kernel.configure(indices, on_value, off_value, output, depth, axis);
+}
+
+/**
+ * Configure with the off value supplied as a constant: a memset kernel that writes off_value to
+ * the output is set up in addition to the one-hot kernel, and is enabled for run().
+ */
+void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output,
+                         PixelValue off_value, int depth, int axis)
+{
+  _has_to_memset = true;
+  _memset_kernel.configure(output, off_value);
+  _onehot_kernel.configure(indices, on_value, output, depth, axis);
+}
+
+/** Static validation; the result of CLOneHotKernel::validate is returned unchanged. */
+Status CLOneHot::validate(const ITensorInfo *indices, const ITensorInfo *on_value,
+                          const ITensorInfo *off_value, const ITensorInfo *output, int depth,
+                          int axis)
+{
+  const Status kernel_status =
+    CLOneHotKernel::validate(indices, on_value, off_value, output, depth, axis);
+  return kernel_status;
+}
+
+/** Enqueue the memset kernel first when it was configured, then the one-hot kernel. */
+void CLOneHot::run()
+{
+  auto &scheduler = CLScheduler::get();
+
+  if (_has_to_memset)
+  {
+    scheduler.enqueue(_memset_kernel, true);
+  }
+
+  scheduler.enqueue(_onehot_kernel, false);
+}
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
new file mode 100644
index 000000000..02ee4ad8a
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "arm_compute/runtime/CL/functions/CLReduceOperation.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+/** Construct an unconfigured function whose memory group uses the given memory manager. */
+CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
+  : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(),
+    _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape()
+{
+}
+
+/**
+ * Static validation of a multi-axis reduction.
+ *
+ * One reduce kernel is validated per axis, chained through intermediate tensor infos whose
+ * reduced dimensions are set to 1. When keep_dims is false, the final reshape from the last
+ * intermediate tensor to the output is validated as well.
+ *
+ * @param input     Source tensor info
+ * @param output    Destination tensor info
+ * @param axis      Set of axes to reduce; must not be empty
+ * @param keep_dims Whether the reduced dimensions are kept (with size 1) in the output
+ * @param op        Reduction operation
+ *
+ * @return An empty Status on success, otherwise the first error that was detected
+ */
+Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                   const std::set<uint32_t> &axis, bool keep_dims,
+                                   const ReductionOperation &op)
+{
+  const size_t num_of_kernels = axis.size();
+
+  // Reject an empty axis set before any arithmetic on the count: with keep_dims the subtraction
+  // below would wrap around for num_of_kernels == 0.
+  ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1);
+
+  // std::set is ordered, so its last element is the largest axis. Report an out-of-range axis as
+  // an error here instead of handing it to TensorShape::set() below.
+  ARM_COMPUTE_RETURN_ERROR_ON(*axis.rbegin() >= TensorShape::num_max_dimensions);
+
+  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+  // Create temporary tensor infos
+  auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+
+  // Create intermediate tensor info
+  TensorShape shape{input->tensor_shape()};
+
+  auto it = axis.begin();
+  for (size_t i = 0; i < num_of_interm_tensors; ++i, ++it)
+  {
+    shape.set(*it, 1, false);
+    interm_tensors[i].set_data_type(input->data_type());
+    interm_tensors[i].set_tensor_shape(shape);
+    interm_tensors[i].set_num_channels(input->num_channels());
+    interm_tensors[i].set_data_layout(input->data_layout());
+    interm_tensors[i].set_quantization_info(input->quantization_info());
+  }
+
+  // Set a vector that is ordered ITensorInfo sequentially.
+  std::vector<const ITensorInfo *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; ++i)
+  {
+    tensors.emplace_back(interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Validate ReduceOperation only on all kernels
+  it = axis.begin();
+  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op));
+  }
+
+  if (!keep_dims)
+  {
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output));
+  }
+
+  return Status{};
+}
+
+/**
+ * Configure the chain of reduce kernels (one per axis) and, when keep_dims is false, the final
+ * reshape that drops the reduced dimensions.
+ *
+ * The arguments are validated first; an invalid configuration raises an error.
+ */
+void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
+                                  const std::set<uint32_t> &axis, bool keep_dims,
+                                  ReductionOperation op)
+{
+  ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op));
+
+  _axis = axis;
+
+  _input = input;
+  _output = output;
+  _keep_dims = keep_dims;
+
+  // NOTE The axis must have no duplication.
+  const size_t num_of_kernels = axis.size();
+  const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
+
+  if (num_of_kernels < 1)
+  {
+    throw std::runtime_error("CLReduceOperation: there is no axis to reduce");
+  }
+
+  _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+  _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+
+  // Set a vector that is ordered ICLTensors sequentially.
+  std::vector<ICLTensor *> tensors;
+  tensors.emplace_back(input);
+  for (size_t i = 0; i < num_of_interm_tensors; ++i)
+  {
+    tensors.emplace_back(_interm_tensors.get() + i);
+  }
+  tensors.emplace_back(output);
+
+  // Apply ReductionOperation on all kernels
+  TensorShape shape{input->info()->tensor_shape()};
+  auto it = axis.begin();
+  for (size_t i = 0; i < num_of_kernels; ++i, ++it)
+  {
+    shape.set(*it, 1, false);
+    // With keep_dims the last kernel writes straight into the output, so it has no intermediate
+    if (!keep_dims || i != (num_of_kernels - 1))
+    {
+      _interm_tensors[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+      _memory_group.manage(&_interm_tensors[i]);
+    }
+    _reduce_kernels[i].configure(tensors[i], tensors[i + 1], *it, op);
+    // The previous intermediate is allocated once the kernel consuming it has been configured
+    if (i != 0)
+    {
+      _interm_tensors[i - 1].allocator()->allocate();
+    }
+  }
+
+  // Configure reshape layer if we want to drop the dimensions
+  if (!keep_dims)
+  {
+    _reshape.configure(&_interm_tensors[num_of_interm_tensors - 1], output);
+    _interm_tensors[num_of_interm_tensors - 1].allocator()->allocate();
+  }
+}
+
+/** Enqueue every reduce kernel in axis order, then run the reshape when keep_dims is false. */
+void CLReduceOperation::run()
+{
+  MemoryGroupResourceScope scope_mg(_memory_group);
+
+  const size_t num_of_kernels = _axis.size();
+  for (size_t i = 0; i < num_of_kernels; ++i)
+  {
+    CLScheduler::get().enqueue(_reduce_kernels[i]);
+  }
+
+  if (!_keep_dims)
+  {
+    _reshape.run();
+  }
+}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
new file mode 100644
index 000000000..a502f032e
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLSplitVEx.h" +#include "support/ToolchainSupport.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include <cassert> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ICLTensor *size_splits, const std::vector<ICLTensor *> &outputs, + unsigned int num_splits) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(size_splits->info()->num_dimensions() != 1, + "size_splits must be a 1-D tensor."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_splits != outputs.size(), + "Number of output tensors does not match number of splits."); + return Status{}; +} + +Status validate_slices(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, + uint32_t split_dim) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(split_dim >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2); + + // Start/End coordinates + Coordinates start_coords; + Coordinates end_coords; + for (unsigned int d = 0; d < input->num_dimensions(); ++d) + { + end_coords.set(d, -1); + } + unsigned int axis_offset = 0; + // Validate output tensors + for (const auto &output : outputs) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + // Get output shape + const TensorShape output_shape = output->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0); + + const size_t axis_split_step = output_shape[split_dim]; + + // Output auto inizialitation if not yet initialized + TensorInfo tmp_output_info = *output->clone(); + auto_init_if_empty(tmp_output_info, + input->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + + // Update coordinate on axis + 
start_coords.set(split_dim, axis_offset); + end_coords.set(split_dim, axis_offset + axis_split_step); + + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords)); + + axis_offset += axis_split_step; + } + + return Status{}; +} + +void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, + std::vector<CLSlice> &_slice_functions, uint32_t split_dim) +{ + unsigned int axis_offset = 0; + // Start/End coordinates + Coordinates start_coords; + Coordinates end_coords; + for (unsigned int d = 0; d < input->info()->num_dimensions(); ++d) + { + end_coords.set(d, -1); + } + int out_iter = 0; + for (const auto &output : outputs) + { + const TensorShape output_shape = output->info()->tensor_shape(); + auto op_size = output_shape.total_size(); + if (!op_size) + { + continue; + } + + assert(op_size != 0); + assert(split_dim <= output_shape.num_dimensions()); + + const size_t axis_split_step = output_shape[split_dim]; + + // Output auto inizialitation if not yet initialized + TensorInfo tmp_output_info = *output->info()->clone(); + auto_init_if_empty( + tmp_output_info, + input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + + // Update coordinate on axis + start_coords.set(split_dim, axis_offset); + end_coords.set(split_dim, axis_offset + axis_split_step); + + // Configure slice function + _slice_functions[out_iter].configure(input, output, start_coords, end_coords); + + // Set valid region from shape + outputs[out_iter++]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape)); + axis_offset += axis_split_step; + } +} + +} // namespace + +CLSplitVEx::CLSplitVEx() + : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() +{ +} + +void CLSplitVEx::configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim, + const std::vector<ICLTensor *> &outputs, unsigned int num_splits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, 
size_splits); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(size_splits, outputs, num_splits)); + + _input = input; + _size_splits = size_splits; + _outputs = outputs; + _num_splits = num_splits; + + // Create tensor slices + _slice_functions.resize(_num_splits); + + // Extract output tensor info + std::vector<ITensorInfo *> outputs_info; + for (auto &output : _outputs) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + outputs_info.emplace_back(output->info()); + } + + // Validate slices + ARM_COMPUTE_ERROR_THROW_ON(validate_slices(_input->info(), outputs_info, split_dim)); + + // Configure slices + configure_slices(_input, _outputs, _slice_functions, split_dim); +} + +void CLSplitVEx::run() +{ + // execute the slices + for (unsigned i = 0; i < _outputs.size(); ++i) + { + _slice_functions[i].run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp new file mode 100644 index 000000000..3ac95a8e6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLTopKV2.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "arm_compute/core/CL/ICLTensor.h" + +#include "../../topk_v2.h" + +namespace arm_compute +{ + +CLTopKV2::CLTopKV2() + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), + _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ +{ +} + +void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits, int bits) +{ + _total_bits = total_bits; + _bits = bits; + _n = input->info()->tensor_shape()[0]; + + // _total_bits should be divided by _bits. 
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0); + + _k = k; + _radix = 1 << bits; + + _input = input; + _values = values; + _indices = indices; + + std::string topk_env; + +// Disable GPU implementation +// TODO Enable GPU implementation with verification, or remove code +// Invalid result on GPU +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n); + } + else if (topk_env == "GPU") + { + // n should be divided by (_GROUPS * _ITEMS) + ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0); + + _hist_buf_size = _radix * _GROUPS * _ITEMS; + _glob_sum_buf_size = _HISTOSPLIT; + + _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _hist_buf_size); + _glob_sum_buf = + cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int)); + _in_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _out_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _in_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _out_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | 
CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _p_in_key_buf = &_in_key_buf; + _p_out_key_buf = &_out_key_buf; + _p_in_ind_buf = &_in_ind_buf; + _p_out_ind_buf = &_out_ind_buf; + + _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n); + _hist_kernel.configure(&_hist_buf, bits, _n); + _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits); + _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _reorder_kernel.configure(&_hist_buf, bits, _n); + _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n); + _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n); + _store_kernel.configure(values, indices, k, _n); + } + else +#endif // Disable GPU implementation + { + // DO NOTHING for CPU. + } +} + +void CLTopKV2::run() +{ + std::string topk_env; +#if 0 + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + run_on_gpu_single_quicksort(); + } + else if (topk_env == "GPU") + { + run_on_gpu(); + } + else +#endif + { + run_on_cpu(); + } +} + +#if 0 +void CLTopKV2::run_on_gpu_single_quicksort() +{ + // This is a single threaded quick sort implementation. + CLScheduler::get().enqueue(_qs_kernel, false); + + arm_compute::CLScheduler::get().sync(); +} + +void CLTopKV2::run_on_gpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + // 1. CLTopKV2Init set key buffer and index buffer. + // - Key buffer is set as the same value of the layer's input + // - Values in the index buffer are set as their indices. + CLScheduler::get().enqueue(_init_kernel, false); + + int n_passes = _total_bits / _bits; + + // 2. Repeat (total_bits/bits) times. + // - total_bits is the number of bits of the data type (e.g., 32 for float) + // - bits defines number of buckets (e.g. 16 buckets where bit is 4) + for (int pass = 0; pass < n_passes; ++pass) + { + arm_compute::CLScheduler::get().sync(); + + // 2.1. 
Calculate histogram with _GROUPS * _ITEMS threads + _hist_kernel.setPass(pass, _p_in_key_buf); + CLScheduler::get().enqueue(_hist_kernel, false); + + // 2.2. Calculate prefix sum locally with multiple threads + CLScheduler::get().enqueue(_scan_hist_kernel, false); + // 2.3. Calculate prefix sum within a work group + CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); + // 2.4. Calculate global prefix sum + CLScheduler::get().enqueue(_paste_hist_kernel, false); + + // 2.5. Reorder keys and indices based on the global prefix sum + _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_kernel, false); + + cl::Buffer *tmp; + // swap key buffers + tmp = _p_in_key_buf; + _p_in_key_buf = _p_out_key_buf; + _p_out_key_buf = tmp; + + // swap index buffers + tmp = _p_in_ind_buf; + _p_in_ind_buf = _p_out_ind_buf; + _p_out_ind_buf = tmp; + } + + // 3. Get the first negative index + // Because we swap in_buf and out_buf at the end of the above for loop, + // the output buffers are in bufs. + _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); + CLScheduler::get().enqueue(_find_first_negative_kernel, false); + + // 4. Correct odering of negatives + // - Since radix sort does not consider negatives, negatives are considered as bigger values + // than positives. + // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf + _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, + _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_negatives_kernel, false); + + // 5. Extract top k values from sorted keys and indices. + _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_store_kernel, false); + + arm_compute::CLScheduler::get().sync(); + +#if 0 + // below code is left for debugging. 
+ int first_neg; + q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); + std::cout << "first neg = " << first_neg << std::endl; + + float in_key[_n]; + q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; + } + + float out_key[_n]; + q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; + } + + int in_ind[_n]; + q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; + } + + int out_ind[_n]; + q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; + } + + int hist_buf[_hist_buf_size]; + q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); + for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { + std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; + } + + int glob_sum_buf[_glob_sum_buf_size]; + q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); + for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { + std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; + } + +#endif +} +#endif // Disable GPU implementation + +void CLTopKV2::run_on_cpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + // const Window& w = _topkv2_kernel.window(); + + _input->map(q); + _values->map(q); + _indices->map(q); + + // int row_size = (w[0].end() - w[0].start()) / w[0].step(); + int row_size = _input->info()->tensor_shape()[0]; + int rank = _input->info()->num_dimensions(); + + if (rank > 2) + throw std::runtime_error("Not supported 
type."); + + int row_num = (rank == 2 ? _input->info()->tensor_shape()[1] : 1); + + if (_input->info()->data_type() == DataType::F32) + { + nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k, + (int32 *)_indices->buffer(), (float *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::S32) + { + nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (int32_t *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (uint8_t *)_values->buffer()); + } + else + { + throw std::runtime_error("Not supported type."); + } + + _input->unmap(q); + _values->unmap(q); + _indices->unmap(q); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp new file mode 100644 index 000000000..3215d01a7 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <cmath> +#include <memory> +#include <tuple> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_manager(std::move(memory_manager)), _function() +{ +} + +void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, + ICLTensor *output, const PadStrideInfo &deconv_info, + unsigned int invalid_right, unsigned int invalid_bottom, + const WeightsInfo &weights_info) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, + invalid_right, invalid_bottom, weights_info); +} + +void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, + ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, + output->info(), deconv_info, invalid_right, + invalid_bottom, weights_info)) + { + case DeconvolutionMethod::DIRECT: + { + auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>(); + f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, + invalid_bottom, weights_info); + _function = std::move(f); + break; + } + case DeconvolutionMethod::GEMM: + { + auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + 
f->configure(compile_context, input, weights, bias, output, deconv_info); + _function = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } +} + +Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, ITensorInfo *output, + const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + switch (CLTransposeConvLayer::get_deconvolution_method( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + { + case DeconvolutionMethod::DIRECT: + { + // Validate direct convolution layer + ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + break; + } + case DeconvolutionMethod::GEMM: + { + // Validate gemm-based convolution layer + ARM_COMPUTE_RETURN_ON_ERROR( + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + return Status{}; +} + +DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) +{ + ARM_COMPUTE_UNUSED(output, bias, weights_info); + + const DataLayout data_layout = input->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + if (weights->dimension(idx_w) != deconv_info.stride().first || + weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 || + invalid_bottom != 0) 
+ { + return DeconvolutionMethod::DIRECT; + } + + return DeconvolutionMethod::GEMM; +} + +void CLTransposeConvLayer::run() +{ + prepare(); + _function->run(); +} + +void CLTransposeConvLayer::prepare() { _function->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp new file mode 100644 index 000000000..80fbf359d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/NEFunctionsEx.cpp @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/NEON/NEFunctionsEx.h" + +// NOTE This empty file aims to validate "NEFunctionsEx.h". +// DO NOT REMOVE this file. diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp new file mode 100644 index 000000000..2fc94b267 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" +#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> + +#include "arm_compute/core/ITensor.h" +#include "support/MemorySupport.h" + +#include <utility> + +namespace arm_compute +{ + +template <BinaryLogicalOperation COP> +void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, + ITensor *output) +{ + auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); +} + +template <BinaryLogicalOperation COP> +Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) +{ + return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output); +} + +void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, + BinaryLogicalOperation op) +{ + auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); +} + +Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, BinaryLogicalOperation op) +{ + return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output); +} + +// Supported Specializations +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; +template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp new file mode 100644 index 000000000..6ad3e1b12 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NECastBool.h" + +#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" +#include "support/MemorySupport.h" + +using namespace arm_compute; + +void NECastBool::configure(const ITensor *input, ITensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status NECastBool::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return NECastBoolKernel::validate(input, output); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp new file mode 100644 index 000000000..e0ab3e025 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" + +#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" +#include "support/MemorySupport.h" + +using namespace arm_compute; + +void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) +{ + auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + k->configure(input, output, lookups); + _kernel = std::move(k); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp new file mode 100644 index 000000000..a123439d9 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + + return Status{}; +} +} // namespace + +void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) +{ + auto k = support::cpp14::make_unique<NETransposeKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, + const ITensorInfo *output) +{ + return NETransposeKernel::validate(input, output); +} + +NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) +{ +} + +void NEFullyConnectedHybridLayer::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); +} + +void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor 
*weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + _accumulate_biases = false; + _original_weights = weights; + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + bool _is_fc_after_conv; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1 && input->info()->dimension(1) > 1; + } + ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv, + "NEFullyConnectedHybridLayer does not support after conv"); + (void)_is_fc_after_conv; + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_output.allocator()->init( + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + 
compute_transposed_shape(*weights->info()))); + _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Quantize input + _quantized_input.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); + _scale_factor.allocator()->init( + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32)); + _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor); + + // GEMM + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output); + + // Multiply scale + _multiply_scale_kernel.configure(&_gemmlowp_output, &_scale_factor, output, + weights->info()->quantization_info().uniform().scale); + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; + + _quantized_input.allocator()->allocate(); + _scale_factor.allocator()->allocate(); + _gemmlowp_output.allocator()->allocate(); +} + +Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *weights_to_use = weights; + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + + // Validate quantization kernel + const ITensorInfo &quantized_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); + const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor)); + + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); + + 
ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate( + &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale)); + + return Status{}; +} + +void NEFullyConnectedHybridLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Quantize input + NEScheduler::get().schedule(&_quant_input_kernel, Window::DimY); + + // Run matrix multiply + _mm_gemmlowp.run(); + + // Multiply scale factor + NEScheduler::get().schedule(&_multiply_scale_kernel, Window::DimY); + + // Accumulate biases if provided + if (_accumulate_biases) + { + NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); + } +} + +void NEFullyConnectedHybridLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + auto release_unused = [](Tensor *w) { + if (!w->is_used()) + { + w->allocator()->free(); + } + }; + + // Reshape of the weights (happens only once) + if (!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + + _are_weights_reshaped = true; + // We can not release _original_weights because it can be used in other nodes + } + + // Prepare GEMM prepare and release unused weights + _mm_gemmlowp.prepare(); + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + _is_prepared = true; + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp new file mode 100644 index 000000000..cb7557a5a --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include <algorithm> +#include <cmath> + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +{ + if (is_data_type_quantized_asymmetric(input.data_type())) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info(input.quantization_info().uniform().scale, + -input.quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights.quantization_info().uniform().scale, + -weights.quantization_info().uniform().offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate( + &input, &weights, nullptr, &output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + } + + return Status{}; +} +} // namespace + +NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), + _converted_weights_output(), 
_reshape_weights_output(), _original_weights(nullptr), + _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), + _accumulate_biases(false), _is_quantized(false), _is_prepared(false) +{ +} + +void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change + // QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo input_quantization_info = input->info()->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); + + input->info()->set_quantization_info(QuantizationInfo( + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->info()->set_quantization_info(QuantizationInfo( + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + + // Configure gemmlowp function + _mm_gemmlowp.configure(input, weights, nullptr, output); + + // Revert back QuantizatioInfo as input and weights could be used in other fully connected + // layers + input->info()->set_quantization_info(input_quantization_info); + weights->info()->set_quantization_info(weights_quantization_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, + GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + } +} + +void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON( + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be + // linearized + + // Initialize output tensor for flatten + TensorShape shape_flatten = 
compute_flatten_shape(input->info()); + _flatten_output.allocator()->init( + input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + shape_flatten)); + + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_kernel.configure(input, &_flatten_output); + + // Configure matrix multiply kernel + configure_mm(&_flatten_output, weights, output); + + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); +} + +void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(input, weights, output); +} + +void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, + const ITensor *biases, ITensor *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; + _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); + _original_weights = weights; + + // Configure gemmlowp output + if (_is_quantized) + { + _gemmlowp_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::S32)); + } + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !_is_quantized) + { + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensor *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + // Reshape weights if needed + if (!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights_function.configure(weights, &_reshape_weights_output); + weights_to_use = &_reshape_weights_output; + } + + // Convert weights if needed + if (_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights.configure(weights_to_use, &_converted_weights_output, + input->info()->tensor_shape(), fc_info.weights_trained_layout); 
+ + weights_to_use = &_converted_weights_output; + _are_weights_converted = false; + } + + ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output; + if (_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(input, weights_to_use, tmp_output); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(input, weights_to_use, tmp_output); + } + + // Configure output stage for asymmetric quantized types + if (_is_quantized) + { + float multiplier = input->info()->quantization_info().uniform().scale * + weights->info()->quantization_info().uniform().scale / + output->info()->quantization_info().uniform().scale; + int output_multiplier; + int output_shift; + quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, + &output_shift); + _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, + output_shift, + output->info()->quantization_info().uniform().offset); + _gemmlowp_output.allocator()->allocate(); + } + + _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; +} + +Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *biases, const ITensorInfo *output, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + + bool weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); + + const ITensorInfo &flatten_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); + const ITensorInfo &reshaped_weights = + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = + weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); + const ITensorInfo &gemmlowp_output = TensorInfo( + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + + // Configure accumulate biases kernel for non quantized asymmetric types + if (biases != nullptr && !is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *input_to_use = input; + const ITensorInfo *weights_to_use = weights; + const ITensorInfo *tmp_output = (is_quantized) ? 
&gemmlowp_output : output; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->dimension(1) > 1; + + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->tensor_shape().cbegin() + 3, input->tensor_shape().cend(), + output->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = input->num_dimensions() > 1; + } + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR( + NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if (is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if (is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); + input_to_use = &flatten_input; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); + } + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output)); + + // Validate output stage for asymmetric quantized types + if (is_quantized) + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( + &gemmlowp_output, biases, output)); + } + + return Status{}; +} + +void NEFullyConnectedLayerEx::run() +{ + if (!_is_prepared) + { + if 
(!_are_weights_reshaped) + _reshape_weights_output.allocator()->allocate(); + if (!_are_weights_converted) + _converted_weights_output.allocator()->allocate(); + _is_prepared = true; + } + + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Reshape of the weights + if (!_are_weights_reshaped) + { + _reshape_weights_function.run(); + } + + // Convert weights if needed + if (!_are_weights_converted) + { + _convert_weights.run(); + } + + // Prepare GEMM prepare + if (!_is_quantized) + { + _mm_gemm.prepare(); + } + } + + MemoryGroupResourceScope scope_mg(_memory_group); + + // Linearize input if it comes from a convolutional layer + if (_is_fc_after_conv) + { + NEScheduler::get().schedule(&_flatten_kernel, Window::DimY); + } + + // Run matrix multiply + if (_is_quantized) + { + _mm_gemmlowp.run(); + } + else + { + _mm_gemm.run(); + } + + // Accumulate biases if provided + if (_is_quantized) + { + _gemmlowp_output_stage.run(); + } + else + { + if (_accumulate_biases) + { + NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); + } + } +} + +void NEFullyConnectedLayerEx::prepare() +{ +#if 0 // TODO Remove this block + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + auto release_unused = [](Tensor *w) { + if (!w->is_used()) + { + w->allocator()->free(); + } + }; + + // Pointer to current weights + const ITensor *cur_weights = _original_weights; + + // Reshape of the weights (happens only once) + if (!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + _reshape_weights_output.allocator()->allocate(); + _reshape_weights_function.run(); + + cur_weights->mark_as_unused(); + cur_weights = &_reshape_weights_output; + _are_weights_reshaped = true; + } + + // Convert weights if needed (happens only once) + if (!_are_weights_converted) + { + _converted_weights_output.allocator()->allocate(); + _convert_weights.run(); + + cur_weights->mark_as_unused(); + _are_weights_converted = 
true; + } + + // Release reshaped weights if unused + release_unused(&_reshape_weights_output); + + // Prepare GEMM prepare and release unused weights + if (!_is_quantized) + { + _mm_gemm.prepare(); + } + + // Release converted weights if unused + release_unused(&_reshape_weights_output); + release_unused(&_converted_weights_output); + + _is_prepared = true; + } +#endif +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp new file mode 100644 index 000000000..dc6c78478 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h" + +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h> +#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h> + +using namespace arm_compute; + +void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input, + const arm_compute::ITensor *weights, + const arm_compute::ITensor *biases, + arm_compute::ITensor *output, bool needs_reshape, + const arm_compute::TensorShape &reshape, + KernelType kernel_type) +{ + _input = input; + _weights = weights; + _biases = biases; + _output = output; + _needs_reshape = needs_reshape; + + const ITensor *input_to_use = input; + if (_needs_reshape) + { + // reshape + auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); + _neon_reshape.configure(_input, &_neon_buffer); + input_to_use = &_neon_buffer; + } + + _neon_fc = [&]() { + if (kernel_type == KernelType::GENERAL) + { + auto fc = new arm_compute::NEFullyConnectedLayerEx{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else + { + assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); + + bool is_hybrid = input->info()->data_type() == DataType::F32 && + (weights->info()->data_type() == DataType::S8 || + weights->info()->data_type() == DataType::QASYMM8_SIGNED); + + if (is_hybrid) + { + auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager}; + ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info()); + const auto orgin_weights_data_type = weights_info->data_type(); + weights_info->set_data_type(DataType::QASYMM8_SIGNED); + fc->configure(input_to_use, _weights, _biases, _output); + weights_info->set_data_type(orgin_weights_data_type); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + else + { + auto 
fc = new arm_compute::NEFullyConnectedLayer{_memory_manager}; + fc->configure(input_to_use, _weights, _biases, _output); + return std::unique_ptr<arm_compute::IFunction>(fc); + } + } + }(); + + // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here. + if (_needs_reshape) + { + _neon_buffer.allocator()->allocate(); + } +} + +void NEFullyConnectedReshapingLayer::run(void) +{ + if (_needs_reshape) + _neon_reshape.run(); + + _neon_fc->run(); +} + +void NEFullyConnectedReshapingLayer::prepare(void) { _neon_fc->prepare(); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp new file mode 100644 index 000000000..433c35d58 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEGatherEx.h" + +#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" +#include "support/MemorySupport.h" + +#include <utility> + +namespace arm_compute +{ +void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) +{ + auto k = support::cpp14::make_unique<NEGatherKernelEx>(); + k->configure(input, indices, output, axis); + _kernel = std::move(k); +} + +Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices, + const ITensorInfo *output, int axis) +{ + return NEGatherKernelEx::validate(input, indices, output, axis); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp new file mode 100644 index 000000000..52d58accf --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" + +#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" +#include "support/MemorySupport.h" + +using namespace arm_compute; + +void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, + ITensor *output, ITensor *hits) +{ + auto k = support::cpp14::make_unique<NEHashtableLookupKernel>(); + k->configure(lookups, keys, input, output, hits); + _kernel = std::move(k); +} + +Status NEHashtableLookup::validate(const ITensorInfo *lookups, const ITensorInfo *keys, + const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *hits) +{ + return NEHashtableLookupKernel::validate(lookups, keys, input, output, hits); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp new file mode 100644 index 000000000..16d74e62d --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +namespace arm_compute +{ +NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx( + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), + _permute_input(), _permute_output(), _permuted_input(), _permuted_output() +{ +} + +void NEInstanceNormalizationLayerEx::configure(ITensor *input, ITensor *output, ITensor *gamma, + ITensor *beta, float epsilon) +{ + const DataLayout data_layout = input->info()->data_layout(); + + // Configure Kernels + _is_nchw = data_layout == DataLayout::NCHW; + + if (!_is_nchw) + { + _memory_group.manage(&_permuted_input); + _memory_group.manage(&_permuted_output); + + // Configure the function to transform the input tensor from NHWC -> NCHW + _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U)); + _permuted_input.info()->set_data_layout(DataLayout::NCHW); + + _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon); + _permuted_output.info()->set_data_layout(DataLayout::NCHW); + + _permute_output.configure(&_permuted_output, output != nullptr ? 
output : input, + PermutationVector(2U, 0U, 1U)); + _permuted_input.allocator()->allocate(); + _permuted_output.allocator()->allocate(); + } + else + { + _normalization_kernel.configure(input, output, gamma, beta, epsilon); + } +} + +Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *gamma, const ITensorInfo *beta, + float epsilon) +{ + return NEInstanceNormalizationLayerKernelEx::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), + &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon); +} + +void NEInstanceNormalizationLayerEx::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + // Permute input + if (!_is_nchw) + { + _permute_input.run(); + } + + NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ); + + // Permute output + if (!_is_nchw) + { + _permute_output.run(); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp new file mode 100644 index 000000000..275c55024 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NEOneHot.h" +#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" +#include "support/MemorySupport.h" +#include <utility> +namespace arm_compute +{ +void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis) +{ + auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>(); + k->configure(indices, depth, on_value, off_value, output, axis); + _kernel = std::move(k); +} +Status NEOneHot::validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + return NEOneHotKernel::validate(indices, depth, on_value, off_value, output, axis); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp new file mode 100644 index 000000000..cb1a26304 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/Tensor.h" + +using namespace arm_compute; + +NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output, ReductionOperation op) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo 
out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + +void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output, ReductionOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], op); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], op); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceOperation::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp new file mode 100644 index 000000000..26a887912 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/NEON/functions/NEReduceSum.h" + +#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute; + +NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), + _reduction_ops(), _keep_dims() +{ +} + +Status NEReduceSum::validate(const ITensorInfo *input, const Coordinates &reduction_axis, + bool keep_dims, const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(keep_dims); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); + + TensorShape out_shape = input->tensor_shape(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + const int input_dims = input->num_dimensions(); + Coordinates axis_local = reduction_axis; + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); + for (unsigned int i = 0; i < reduction_ops; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); + ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > + input->num_dimensions() - 1); + if (output->total_size() > 0 && keep_dims) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); + } + if (keep_dims) + { + out_shape.set(axis_local[i], 1); + } + else + { + out_shape.remove_dimension(axis_local[i] - i); + } + } + const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info); + + return Status{}; +} + 
+void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, + ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _reduction_ops = reduction_axis.num_dimensions(); + _reduction_kernels.resize(_reduction_ops); + _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0)); + _keep_dims = keep_dims; + + Coordinates axis_local = reduction_axis; + const int input_dims = input->info()->num_dimensions(); + const unsigned int reduction_ops = reduction_axis.num_dimensions(); + + // Convert negative axis + for (unsigned int i = 0; i < reduction_ops; ++i) + { + axis_local[i] = wrap_around(axis_local[i], input_dims); + } + + // Perform reduction for every axis + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + TensorShape out_shape = + i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + out_shape.set(axis_local[i], 1); + auto in = (i == 0) ? input : (&_reduced_outs[i - 1]); + + if (i == _reduction_ops - 1 && keep_dims) + { + _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::SUM); + } + else + { + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), + input->info()->data_type(), + input->info()->quantization_info()) + .set_data_layout(input->info()->data_layout())); + _memory_group.manage(&_reduced_outs[i]); + _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], + ReductionOperation::SUM); + } + } + + // Allocate intermediate tensors + for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 
1 : 0); ++i) + { + _reduced_outs[i].allocator()->allocate(); + } + + // Configure reshape layer if we want to drop the dimensions + if (!keep_dims) + { + TensorShape out_shape = input->info()->tensor_shape(); + + // We have to sort the reduction axis vectors in order for remove_dimension + // to work properly + std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + out_shape.remove_dimension(axis_local[i] - i); + } + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape)); + _reshape.configure(&_reduced_outs[_reduction_ops - 1], output); + } +} + +void NEReduceSum::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _reduction_ops; ++i) + { + _reduction_kernels[i].run(); + } + + if (!_keep_dims) + { + _reshape.run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp new file mode 100644 index 000000000..aa165cc15 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/UtilsEx.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ + +NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), + _conv_f(), + _upsample_f(), + _flip_weights(), + _scaled_output(), + _weights_flipped(), + _flip_axis(), + _original_weights(nullptr), + _input(nullptr), + _info(), + _is_prepared(false) +{ +} + +Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, + const ITensorInfo *bias, const ITensorInfo *output, + const PadStrideInfo &info, unsigned int invalid_right, + unsigned int invalid_bottom) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); + const unsigned int width_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); + + auto out_dims = transposeconv_output_dimensions( + input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), + weights->dimension(height_idx), info, invalid_right, invalid_bottom); + + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + if (bias != nullptr) + { + if (is_data_type_quantized_asymmetric(input->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + } + } + + if (output->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); + } + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, + pad_bottom); + TensorInfo scale_out_info( + input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != + scale_out_info.dimension(batches_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != + scale_out_info.dimension(channel_idx)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, + 
conv_info, WeightsInfo())); + + return Status{}; +} + +void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const PadStrideInfo &info, + unsigned int invalid_right, unsigned int invalid_bottom) +{ + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate( + input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), + info, invalid_right, invalid_bottom)); + + const DataLayout data_layout = input->info()->data_layout(); + const unsigned int width_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + auto out_dims = transposeconv_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info, + invalid_right, invalid_bottom); + + const TensorShape output_shape = + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + + _input = input; + _original_weights = weights; + _info = info; + _is_prepared = false; + + unsigned int pad_left = 0; + unsigned int pad_right = 0; + unsigned int pad_top = 0; + unsigned int pad_bottom = 0; + const unsigned int stride_x = info.stride().first; + const unsigned int stride_y = info.stride().second; + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + + _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); + _memory_group.manage(&_scaled_output); + + _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); + _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); + + // setup the 
function to convolve the upscaled output + const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); + + const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); + + const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, + DimensionRoundingType::FLOOR); + + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); + scale_out_info.set_data_layout(data_layout); + _scaled_output.allocator()->init(scale_out_info); + + _upsample_f.configure(input, &_scaled_output, upsample_info); + + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info); + + // Setup flip axis data + _flip_axis.allocator()->allocate(); + auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer()); + axis_data[0] = static_cast<uint32_t>(width_idx); + axis_data[1] = static_cast<uint32_t>(height_idx); + + _scaled_output.allocator()->allocate(); +} + +void NETransposeConvLayer::run() +{ + prepare(); + + MemoryGroupResourceScope scope_mg(_memory_group); + + _upsample_f.run(); + _conv_f.run(); +} + +void NETransposeConvLayer::prepare() +{ + if (!_is_prepared) + { + ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); + + // Run weights flipping and mark original weights tensor as unused + _weights_flipped.allocator()->allocate(); + _flip_weights.run(); + _original_weights->mark_as_unused(); + + // Prepare convolution + _conv_f.prepare(); + + _is_prepared = true; + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/topk_v2.h b/compute/ARMComputeEx/src/runtime/topk_v2.h new file mode 100644 index 000000000..f94effea1 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/topk_v2.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file topk_v2.h + * @brief This file contains TopK method and TopContainer class for TopK operation + * @ingroup COM_AI_RUNTIME + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ +#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__ + +typedef int32_t int32; + +namespace nnfw +{ +namespace rt +{ +namespace optimized_ops +{ +/** + * @brief class to define TopK operation + * @note The following code is implemented and modified while referring to TFLite topk_v2.cc file. + * TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than + * TFLite. + * (TFLite additionally supports kTfLiteInt64.) + * + * The class that collects top indexes of k values. Based on template + * tensorflow::gtl::TopN<> but, for optimization, + * it re-uses the same container. 
// Same 32-bit signed type as the file-level `int32` typedef; declared here as well so the
// TopK code does not depend on that typedef being visible first.
using int32 = int32_t;

/**
 * @brief Collects the indexes of the top k values of one row at a time.
 *        The internal container is re-used between rows (see start_collecting()).
 */
template <typename T> class TopContainer
{
public:
  /**
   * @brief Prevent default constructor of this class
   */
  TopContainer() = delete;
  /**
   * @brief Constructor with params
   * @param [in] k         The top k predictions (a negative value is treated as 0)
   * @param [in] row_size  Size of row in data (a negative value is treated as 0)
   */
  TopContainer(int32 k, int32 row_size) : k_(std::max<int32>(k, 0)), container_(), values_(nullptr)
  {
    // At most min(k, row_size) indexes are kept, plus one scratch slot used while heapifying.
    // Clamping first keeps a negative argument from turning into a huge reserve() request.
    const int32 kept = std::min(k_, std::max<int32>(row_size, 0));
    container_.reserve(static_cast<size_t>(kept) + 1);
  }

  /**
   * @brief Prevent instances of this class from being copied (As this class contains pointers)
   * @param [in] topContainer To copy
   */
  TopContainer(const TopContainer &) = delete;
  /**
   * @brief Prevent instances of this class from being copied (As this class contains pointers)
   * @param [in] topContainer To copy
   * @return Reference of TopContainer
   */
  TopContainer &operator=(const TopContainer &) = delete;

  /**
   * @brief Start collecting
   * @param [in] values To set as values; must stay valid until sorted_result() has been called
   * @return N/A
   */
  void start_collecting(const T *values)
  {
    values_ = values;
    container_.clear();
  }

  /**
   * @brief Push a value to be compared for topk
   * @note start_collecting() must have been called first: candidates are compared by looking
   *       them up in the values pointer set there.
   * @param [in] a Index (into the current values) of the candidate
   * @return N/A
   */
  void push(int32 a)
  {
    const auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    const size_t limit = static_cast<size_t>(k_);
    if (container_.size() <= limit)
    {
      container_.push_back(a);
      if (container_.size() == limit + 1)
      {
        // k + 1 candidates collected: heapify and park the worst one in the last slot, so
        // that [begin, end - 1) is a heap of the best k with the worst of them at the front.
        std::make_heap(container_.begin(), container_.end(), comparator);
        std::pop_heap(container_.begin(), container_.end(), comparator);
      }
    }
    else if (comparator(a, container_.front()))
    {
      // The candidate beats the worst kept entry: overwrite the parked slot, then restore the
      // "best k in the heap, loser in the last slot" layout.
      container_.back() = a;
      std::push_heap(container_.begin(), container_.end(), comparator);
      std::pop_heap(container_.begin(), container_.end(), comparator);
    }
  }

  /**
   * @brief Get sorted result from pushed values
   * @return Reference of vector with at most k indexes, ordered by descending value
   *         (equal values are ordered by ascending index)
   */
  const std::vector<int32> &sorted_result()
  {
    const auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    const size_t limit = static_cast<size_t>(k_);
    if (container_.size() <= limit)
    {
      // Fewer than k + 1 candidates were pushed, so no heap was ever built.
      std::sort(container_.begin(), container_.end(), comparator);
    }
    else
    {
      // The last slot only holds the discarded candidate; sort the heap part and drop it.
      std::sort_heap(container_.begin(), container_.end() - 1, comparator);
      container_.resize(limit);
    }
    return container_;
  }

private:
  int32 k_;
  std::vector<int32> container_;
  const T *values_ = nullptr;

  // Strict ordering "a ranks before b": larger value first, lower index first on ties.
  bool compare_fun(int32 a, int32 b) const
  {
    if (values_[b] < values_[a])
    {
      return true;
    }
    if (values_[b] > values_[a])
    {
      return false;
    }
    return a < b;
  }
};

/**
 * @brief Operates TopK operation with params
 * @note Does nothing when k or row_size is not positive. When k is larger than row_size, only
 *       the first row_size entries of each k-wide output row are written.
 * @param [in] row_size Size of row in data
 * @param [in] num_rows The number of rows in data
 * @param [in] data To be operated in
 * @param [in] k The top k predictions
 * @param [out] output_indexes Indexes of targets in the top k predictions
 * @param [out] output_values Values of targets in the top k predictions
 * @return N/A
 */
template <typename T>
void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
          T *output_values)
{
  // Nothing can be selected; return before a negative k is used as an output stride.
  if (k <= 0 || row_size <= 0)
  {
    return;
  }

  TopContainer<T> topc(k, row_size);
  for (int32 row = 0; row < num_rows; ++row)
  {
    // Offsets are computed in 64 bits so large tensors do not overflow 32-bit arithmetic.
    const int64_t in_offset = static_cast<int64_t>(row) * row_size;
    const int64_t out_offset = static_cast<int64_t>(row) * k;

    const T *values_row = data + in_offset;
    topc.start_collecting(values_row);
    for (int32 c = 0; c < row_size; ++c)
    {
      topc.push(c);
    }

    // Prepare output buffers.
    int32 *indexes_row = output_indexes + out_offset;
    T *output_row = output_values + out_offset;
    // We always assume that the output is sorted.
    const auto &top_k = topc.sorted_result();
    std::copy(top_k.begin(), top_k.end(), indexes_row);
    std::transform(top_k.begin(), top_k.end(), output_row,
                   [values_row](const int32 loc) { return values_row[loc]; });
  }
}