Diffstat (limited to 'compute/ARMComputeEx')
154 files changed, 18413 insertions, 4604 deletions
diff --git a/compute/ARMComputeEx/CMakeLists.txt b/compute/ARMComputeEx/CMakeLists.txt index 58f558db2..c8d12c249 100644 --- a/compute/ARMComputeEx/CMakeLists.txt +++ b/compute/ARMComputeEx/CMakeLists.txt @@ -14,7 +14,7 @@ file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp") # generate embeded cl_kernel execute_process ( WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" - COMMAND bash -c "python resolve_includes.py" + COMMAND bash -c "python3 resolve_includes.py" ) add_library(arm_compute_ex SHARED ${ACL_EX_SRCS}) diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index d29886a9d..d3e116381 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -255,14 +255,14 @@ private: cl::Device _device; /**< Underlying CL device. */ std::string _kernel_path; /**< Path to the kernels folder. */ mutable std::map<std::string, const Program> - _programs_map; /**< Map with all already loaded program data. */ + _programs_map; /**< Map with all already loaded program data. */ mutable std::map<std::string, cl::Program> - _built_programs_map; /**< Map with all already built program data. */ + _built_programs_map; /**< Map with all already built program data. */ static const std::map<std::string, std::string> - _kernel_program_map; /**< Map that associates kernel names with programs. */ + _kernel_program_map; /**< Map that associates kernel names with programs. */ static const std::map<std::string, std::string> - _program_source_map; /**< Contains sources for all programs. - Used for compile-time kernel inclusion. >*/ + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. >*/ }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h new file mode 100644 index 000000000..46d4ae858 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H +#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H + +#include "src/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the reduction operation kernel + * + * @note The default data type for an uninitialized output tensor is + * signed 32-bit integer (S32). It is the user's responsibility to check + * that the results do not overflow because the indices are computed + * in unsigned 32-bit (U32). + */ +class CLArgMinMaxLayerKernelEx : public ICLKernel +{ +public: + /** Default constructor */ + CLArgMinMaxLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernelEx(const CLArgMinMaxLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLArgMinMaxLayerKernelEx &operator=(const CLArgMinMaxLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernelEx(CLArgMinMaxLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CLArgMinMaxLayerKernelEx &operator=(CLArgMinMaxLayerKernelEx &&) = default; + /** Default destructor */ + ~CLArgMinMaxLayerKernelEx() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor of the previous iterations of @ref + * CLArgMinMaxLayerKernelEx. Data types supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[out] output Destination tensor. Data types supported: U32/S32 + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. + */ + void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, + unsigned int axis, ReductionOperation op); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArgMinMaxLayerKernelEx. + * + * @param[in] input Source tensor info. Data types supported: S32/F16/F32. + * @param[in] prev_output Destination tensor info of the previous iterations. Data types + * supported: U32/S32 + * Has to be nullptr for the first iteration + * @param[in] output Destination tensor info. Data types supported: U32/S32 + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1,2,3 + * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, ReductionOperation op); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_prev_output; + ICLTensor *_output; + unsigned int _reduction_axis; + ReductionOperation _op; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h index bb6fcb8f5..eac866b67 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -41,8 +41,8 @@ #ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ #define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h new file mode 100644 index 000000000..cf671102e --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * @file CLCastBoolKernel.h + * @ingroup COM_AI_RUNTIME + * @brief This file defines CLCastBoolKernel class + */ + +#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ +#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ + +#include "src/core/CL/ICLSimple3DKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** + * @brief Class for the kernel converting boolean type + */ +class CLCastBoolKernel : public ICLSimple3DKernel +{ +public: + /** + * @brief Initialise the kernel's input and output. + * @param[in] input Input tensor. Data types supported: U8 + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @return N/A + */ + void configure(const ICLTensor *input, ICLTensor *output); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLCastBoolKernel + * + * @param[in] input Source tensor info. Data types supported: U8. + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h index a614d5259..6729fb0f1 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -47,15 +47,15 @@ #ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ #define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { class ICLTensor; /** -* @brief Class to perform EmbeddingLookup operation with opencl kernel -*/ + * @brief Class to perform EmbeddingLookup operation with opencl kernel + */ class CLEmbeddingLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h new file mode 100644 index 000000000..64908ab59 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H +#define ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H + +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +/** Interface to add a bias to each row of the input tensor + * + */ +class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLGEMMMatrixAccumulateBiasesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMMatrixAccumulateBiasesKernel & + operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types + * supported: Same as @p input + */ + void configure(ICLTensor *accum, const ICLTensor *biases); + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data + * types supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, ICLTensor *accum, + const ICLTensor *biases); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLGEMMMatrixAccumulateBiasesKernel + * + * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. 
Data types + * supported: Same as @p input + * @param[in] gpu_target GPU target + * + * @return a status + */ + static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_accum; + const ICLTensor *_biases; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h index 6630c7be7..a55f2401d 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ #define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h index 99cfa61ec..f9d6f7cc5 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ #define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" namespace arm_compute @@ -55,8 +55,8 @@ namespace arm_compute class ICLTensor; /** -* @brief Class to perform HashtableLookup operation with opencl kernel -*/ + * @brief Class to perform HashtableLookup operation with opencl kernel + */ class CLHashtableLookupKernel : public ICLKernel { public: diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h index f57e799ad..7da9e9a4c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ #define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h new file mode 100644 index 000000000..4befdd05c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H +#define ARM_COMPUTE_CLMEMSETKERNEL_H + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for filling the planes of a tensor */ +class CLMemsetKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLMemsetKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMemsetKernel(const CLMemsetKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMemsetKernel &operator=(const CLMemsetKernel &) = delete; + /** Allow instances of this class to be moved */ + CLMemsetKernel(CLMemsetKernel &&) = default; + /** Allow instances of this class to be moved */ + CLMemsetKernel &operator=(CLMemsetKernel &&) = default; + /** Default destructor */ + ~CLMemsetKernel() = default; + + /** Initialise the kernel's tensor and filling value + * + * @param[in,out] tensor Input tensor to fill. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default + * is nullptr. + */ + void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr); + /** Initialise the kernel's tensor and filling value + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Input tensor to fill. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default + * is nullptr. + */ + void configure(const CLCompileContext &compile_context, ICLTensor *tensor, + const PixelValue &constant_value, Window *window = nullptr); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLMemsetKernel + * + * @param[in] tensor Source tensor info. Data types supported: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default is + * nullptr. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, + Window *window = nullptr); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_tensor; + Window _full_window; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLMEMSETRKERNEL_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h index 90e8b5705..5394a062c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ #define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h index fa383c0d0..384050aff 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ #define __ARM_COMPUTE_CLNEGKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h new file mode 100644 index 000000000..1d64f9f7d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__ +#define __ARM_COMPUTE_CLONEHOTKERNEL_H__ +#include "src/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" +namespace arm_compute +{ +class ICLTensor; +/** Interface for the kernel to perform one-hot encoding*/ +class CLOneHotKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLOneHotKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHotKernel(const CLOneHotKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHotKernel &operator=(const CLOneHotKernel &) = delete; + /** Allow instances of this class to be moved */ + CLOneHotKernel(CLOneHotKernel &&) = default; + /** Allow instances of this class to be moved */ + CLOneHotKernel &operator=(CLOneHotKernel &&) = default; + /** Default destructor */ + ~CLOneHotKernel() = default; + /** Initialise the kernel's inputs and output + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value, + ICLTensor *output, int depth, int axis = -1); + /** Initialise the kernel's inputs and output already initialized to off_value + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, int depth, + int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. 
+ * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel without off_value + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *output, int depth, int axis = -1); + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + /** Initialise the kernel's inputs and outputs internally + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure_common(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + int depth, int axis); + +private: + const ICLTensor *_indices; /**< Indices tensor */ + const ICLTensor *_on_value; /**< On value tensor */ + const ICLTensor *_off_value; /**< Off value tensor */ + ICLTensor *_output; /**< Destination tensor */ + bool _is_off_value_memset; /**< Whether off_value is zero */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLONEHOTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h new file mode 100644 index 000000000..d4230aaf3 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLPADLAYERKERNELEX_H +#define ARM_COMPUTE_CLPADLAYERKERNELEX_H + +#include "src/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the PadLayer function. */ +class CLPadLayerKernelEx : public ICLKernel +{ +public: + /** Default constructor */ + CLPadLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernelEx(const CLPadLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernelEx &operator=(const CLPadLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CLPadLayerKernelEx(CLPadLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CLPadLayerKernelEx &operator=(CLPadLayerKernelEx &&) = default; + /** Default destructor */ + ~CLPadLayerKernelEx() = default; + /** Set the input and output tensor. + * + * @param[in] input Source tensor. Data types supported: U8, S8, QASYMM8, + * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Set the input and output tensor. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The + * pair padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. 
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPadLayerKernelEx + * + * @param[in] input Source tensor info. Data types supported: U8, S8, QASYMM8, + * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32. + * @param[in] output Output tensor info. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + int _input_start_x; + int _input_start_y; + bool _4d_enabled; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLPADLAYERKERNELEX_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h index 4e1b56cba..3f60db7bb 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ #define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h index 9b8a239d3..548f29a27 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -47,8 +47,8 @@ #ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ #define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/TypesEx.h" +#include "src/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" namespace arm_compute { @@ -95,7 +95,7 @@ public: * @return N/A */ void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, - ReduceOperation op); + ReductionOperation op); /** * @brief Static function to check if given info will lead to a valid configuration of @ref @@ -108,7 +108,7 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ReduceOperation op); + ReductionOperation op); /* * @brief Run CLReduceOperationKernel op diff --git 
a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h index 4d4478ece..5f5b7f9b8 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ #define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h index aa4a14812..09073af7c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ #define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" // these parameters can be changed #define _ITEMS 16 // number of items in a group diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h b/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h deleted file mode 100644 index 28114f8b5..000000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/NEElementwiseOperationFuncs.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ -#define __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ - -#include <arm_neon.h> - -namespace arm_compute -{ -class ITensor; -class Window; -class QuantizationInfo; -} // namespace arm_compute - -namespace arm_compute -{ - -float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, - const float32x4_t &scale); - -void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, - const float32x4_t &invscale); - -float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale); - -void elementwise_op_quantized( - const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, - float32x4_t, float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, - int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)); - -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - float (*scalar_func)(const float &, const float &), - int (*broadcast_func)(int, int, int, const float *, const float &, float *, - const bool), - int (*neon_func)(int, int, int, const float *, const float *, float *)); - -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), - int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, - uint8_t *, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)); -} // namespace arm_compute -#endif // __ARM_COMPUTE_NEELEMENTWISEOPERATIONFUNCS_H__ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h deleted file mode 100644 index a827f48f8..000000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__ -#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/utils/misc/Traits.h" - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#include <arm_fp16.h> -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -namespace arm_compute -{ -class ITensor; - -/** Interface for the activation layer kernel. */ -class NEActivationLayerKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NEActivationLayerKernelEx"; } - /** Constructor */ - NEActivationLayerKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEActivationLayerKernelEx(const NEActivationLayerKernelEx &) = delete; - /** Default move constructor */ - NEActivationLayerKernelEx(NEActivationLayerKernelEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEActivationLayerKernelEx &operator=(const NEActivationLayerKernelEx &) = delete; - /** Default move assignment operator */ - NEActivationLayerKernelEx &operator=(NEActivationLayerKernelEx &&) = default; - /** Set the input and output tensor. - * - * @note If the output tensor is a nullptr, the activation function will be performed in-place - * - * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this - * tensor will store the result - * of the activation function. Data types supported: - * QASYMM8/QSYMM16/F16/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input - * @param[in] activation_info Activation layer information. - */ - void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEActivationLayerKernelEx - * - * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor - * will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. - * @param[in] output Destination tensor info. Data type supported: same as @p input - * @param[in] act_info Activation layer information. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfo &act_info); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - using ActivationFunction = ActivationLayerInfo::ActivationFunction; - /** Common signature for all the specialised @ref NEActivationLayerKernelEx functions - * - * @param[in] window Region on which to execute the kernel. - */ - using ActivationFunctionExecutorPtr = void (NEActivationLayerKernelEx::*)(const Window &window); - /** Function to apply an activation function on a tensor. 
- * - * @param[in] window Region on which to execute the kernel - */ - template <ActivationLayerInfo::ActivationFunction F, typename T> - typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type - activation(const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template <ActivationLayerInfo::ActivationFunction F, typename T> - typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type - activation(const Window &window); - /** Function to apply an activation function on a tensor. - * - * @param[in] window Region on which to execute the kernel - */ - template <ActivationLayerInfo::ActivationFunction F, typename T> - typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type - activation(const Window &window); - -private: - ITensor *_input; - ITensor *_output; - ActivationFunctionExecutorPtr _func; - ActivationLayerInfo _act_info; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEACTIVATIONLAYERKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h deleted file mode 100644 index 8c544cda8..000000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ -#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ - -#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ - -class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel -{ -public: - /** Default destructor */ - ~NEBinaryLogicalOperationKernel() = default; - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEBinaryLogicalOperationKernel - * - * @param[in] op Binary logical operation to be executed. - * @param[in] input1 First tensor input. Data types supported: QASYMM8/U8. - * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[in] output Output tensor. Data types supported: Same as @p input1. - */ - void configure(BinaryLogicalOperation op, const ITensor *input1, const ITensor *input2, - ITensor *output); - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEBinaryLogicalOperationKernel - * - * @param[in] op Binary logical operation to be executed. - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * - * @return a Status - */ - static Status validate(BinaryLogicalOperation op, const ITensorInfo *input1, - const ITensorInfo *input2, const ITensorInfo *output); - -protected: - // Inherited methods overridden: - static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, - const ITensorInfo &output); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h index 1693922b7..036d56e69 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReductionOperationEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,63 +37,58 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__ +#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__ -#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ -#define __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ - -#include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h" -#include "arm_compute/core/TypesEx.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { class ITensor; -/** Basic function to simulate a reduction operation. 
This function calls the following NEON - * kernels: - * - * -# @ref NEFillBorderKernel - * -# @ref NEReductionOperationKernelEx - * +/** + * @brief Class for the kernel converting boolean type */ -class NEReductionOperationEx : public IFunction +class NECastBoolKernel : public INEKernel { public: - /** Default constructor */ - NEReductionOperationEx(); - /** Set the input and output tensors. + const char *name() const override { return "NECastBoolKernel"; } + /** Default constructor*/ + NECastBoolKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastBoolKernel(const NECastBoolKernel &) = delete; + /** Default move constructor */ + NECastBoolKernel(NECastBoolKernel &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECastBoolKernel &operator=(const NECastBoolKernel &) = delete; + /** Default move assignment operator */ + NECastBoolKernel &operator=(NECastBoolKernel &&) = default; + /** Set the input and output of the kernel * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. - * @param[in] axis Dimension along which to reduce. - * @param[in] op Reduction operation to perform. + * Valid conversions Input -> Output : + * + * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16 + * + * @param[in] input The input tensor to convert. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. */ - void configure(ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); - + void configure(const ITensor *input, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref - * NEReductionOperationEx. + * NECastBoolKernel * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p - * input. - * @param[in] axis Dimension along which to reduce. - * @param[in] op Reduction operation to perform. + * @param[in] input Source tensor info. Data types supported: U8 + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. 
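For reference, a minimal sketch of how the new NECastBoolKernel could be exercised on its own. This is not part of the patch; the tensor shapes, the F32 destination type and the scheduling axis are illustrative assumptions.

#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

Tensor in, out;
// Boolean input is stored as U8, one byte per element; any supported destination type works.
in.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::U8));
out.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

NECastBoolKernel cast;
ARM_COMPUTE_ERROR_THROW_ON(NECastBoolKernel::validate(in.info(), out.info()));
cast.configure(&in, &out);

in.allocator()->allocate();
out.allocator()->allocate();
// ... fill `in` with 0/1 values ...
NEScheduler::get().schedule(&cast, Window::DimY);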
* * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, - ReduceOperation op); + static Status validate(const ITensorInfo *input, const ITensorInfo *output); // Inherited methods overridden: - void run() override; + void run(const Window &window, const ThreadInfo &info) override; private: - NEReductionOperationKernelEx _reduction_kernel; - NEFillBorderKernel _fill_border_kernel; - size_t _window_split; - int _reduction_axis; + const ITensor *_input; + ITensor *_output; }; } // namespace arm_compute -#endif /* __ARM_COMPUTE_NEREDUCTIONOPERATIONEX_H__ */ +#endif /*__ARM_COMPUTE_NECASTBOOLKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h index 88f21c96e..621500eb8 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ #define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h index 6e8bdc1c2..f8f7ac567 100644 --- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2019 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,74 +37,56 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H +#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H -#ifndef __ARM_COMPUTE_CPPONEHOTERNEL_H__ -#define __ARM_COMPUTE_CPPONEHOTERNEL_H__ - -#include "arm_compute/core/CPP/ICPPKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { class ITensor; - -/** CPP kernel to perform tensor OneHot operation. 
*/ -class CPPOneHotKernelEx : public ICPPKernel +/** NEON kernel to add a bias to each row of the input tensor */ +class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel { public: - const char *name() const override { return "CPPOneHotKernelEx"; } + const char *name() const override { return "NEGEMMMatrixAccumulateBiasesKernel"; } /** Default constructor */ - CPPOneHotKernelEx(); + NEGEMMMatrixAccumulateBiasesKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - CPPOneHotKernelEx(const CPPOneHotKernelEx &) = delete; + NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - CPPOneHotKernelEx &operator=(const CPPOneHotKernelEx &) = delete; + NEGEMMMatrixAccumulateBiasesKernel & + operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; /** Allow instances of this class to be moved */ - CPPOneHotKernelEx(CPPOneHotKernelEx &&) = default; + NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default; /** Allow instances of this class to be moved */ - CPPOneHotKernelEx &operator=(CPPOneHotKernelEx &&) = default; + NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default; /** Default destructor */ - ~CPPOneHotKernelEx() = default; - - /** Set the input and output of the kernel. + ~NEGEMMMatrixAccumulateBiasesKernel() = default; + /** Set the accumulate buffer and the biases of the kernel. * - * @param[in] indices A tensor for indices. Data types supported: S32 - * @param[in] depth A tensor for depth. Data types supported: S32 - * @param[in] on_value A tensor for on_value. Data types supported: F32 - * @param[in] off_value A tensor for off_value. Data types supported: F32* - * @param[out] output A tensor for computed value of one hot operator - * @param[in] axis An int value for axis + * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32 + * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type + * supported: Same as @p input */ - void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, - const ITensor *off_value, ITensor *output, const int axis); - + void configure(ITensor *accum, const ITensor *biases); /** Static function to check if given info will lead to a valid configuration of @ref - * CPPOneHotKernelEx + * NEGEMMMatrixAccumulateBiasesKernel * - * @param[in] indices A tensor for indices. Data types supported: S32 - * @param[in] depth A tensor for depth. Data types supported: S32 - * @param[in] on_value A tensor for on_value. Data types supported: F32 - * @param[in] off_value A tensor for off_value. Data types supported: F32* - * @param[in] axis An int value for axis + * @param[in] accum The accumulate tensor to convert. Data type supported: F32 + * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type + * supported: Same as @p input * * @return a status */ - static Status validate(const ITensor *indices, const ITensor *depth, const ITensor *on_value, - const ITensor *off_value, const int axis); + static Status validate(const ITensorInfo *accum, const ITensorInfo *biases); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - bool is_parallelisable() const override; private: - /** Template function to run the topKV operation. 
*/ - template <typename T> void run_one_hot(); - - const ITensor *_indices; - const ITensor *_depth; - const ITensor *_on_value; - const ITensor *_off_value; - ITensor *_output; - int _axis; + ITensor *_accum; + const ITensor *_biases; }; } // namespace arm_compute -#endif /*__ARM_COMPUTE_CPPONEHOTKERNEL_H__ */ +#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h index e765aa489..a03e08ade 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__ #define __ARM_COMPUTE_NEGATHERKERNELEX_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute @@ -126,6 +126,7 @@ private: const ITensor *_input; const ITensor *_indices; int _axis; + size_t _indices_rank; ITensor *_output; kernel_ptr _func; }; diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h index cb2a485d5..fb3a72725 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ #define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h index 8724cc69b..1d786b59e 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ #define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h index 198b0be9d..ab534fe96 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ #define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h new file mode 100644 index 000000000..c1c9f7a3c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#define __ARM_COMPUTE_NEONEHOTKERNEL_H__
+#include "src/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Kernel to perform the OneHot operation on NEON */
+class NEOneHotKernel : public INEKernel
+{
+public:
+ /** Default constructor. */
+ NEOneHotKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEOneHotKernel(const NEOneHotKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ NEOneHotKernel &operator=(const NEOneHotKernel &) = delete;
+ /** Allow instances of this class to be moved. */
+ NEOneHotKernel(NEOneHotKernel &&) = default;
+ /** Allow instances of this class to be moved. */
+ NEOneHotKernel &operator=(NEOneHotKernel &&) = default;
+ /** Default destructor */
+ ~NEOneHotKernel() = default;
+ /** Name of the kernel
+ *
+ * @return Kernel name
+ */
+ const char *name() const override { return "NEOneHotKernel"; }
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+ * following types: U32/S32
+ * @param[in] depth The tensor for depth of the one hot dimension.
+ * Supported tensor rank: up to 3.
+ * Must be one of the following types: U32/S32
+ * @param[in] on_value On value tensor. Supported tensor rank: only 1.
+ * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] off_value Off value tensor. Supported tensor rank: only 1.
+ * Data type supported: Same as @p on_value
+ * @param[out] output Destination tensor. Data type supported: Same as @p on_value
+ * @param[in] axis (Optional) The axis to fill. Negative values wrap around.
+ * Defaults to -1.
+ * The value must be in range [-indices.rank , indices.rank) + */ + void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEOneHotKernel + * + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis = -1); + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Implementation of the onehot operation for 0 axis. + * + * For onehot on the 0 axis an element by element copy is performed. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. + */ + template <typename U> void onehot_0_axis(const Window &window, const ThreadInfo &info); + /** Implementation of the onehot operation. + * + * For 1<=axis a row-wise copy is taking place. + * + * @param[in] window Region on which to execute the kernel. (Must be a region of the window + * returned by window()) + * @param[in] info Info about executing thread and CPU. 
+ */ + template <typename U> void onehot_n_axis(const Window &window, const ThreadInfo &info); + using kernel_ptr = void (NEOneHotKernel::*)(const Window &window, const ThreadInfo &info); + const ITensor *_indices; + const ITensor *_depth; + const ITensor *_on_value; + const ITensor *_off_value; + int _axis; + ITensor *_output; + kernel_ptr _func; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEONEHOTKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h index 0b080cf73..1fd5362ae 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ #define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h deleted file mode 100644 index c9024fbb3..000000000 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
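For orientation, a minimal sketch of driving the new NEOneHotKernel declared above. This is not part of the patch; the shapes, data types, depth value, fill-before-configure ordering and scheduling axis are assumptions for illustration.

#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

Tensor indices, depth, on_value, off_value, output;
indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));     // e.g. holds the value 10
on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
// Output shape assumed to be [10, 4]: depth 10 placed in the innermost dimension for axis = -1.
output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

for (auto *t : {&indices, &depth, &on_value, &off_value, &output})
  t->allocator()->allocate();
// ... fill indices, depth, on_value and off_value before configuring ...

NEOneHotKernel onehot;
onehot.configure(&indices, &depth, &on_value, &off_value, &output, -1);
NEScheduler::get().schedule(&onehot, Window::DimY);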
- */ - -#ifndef __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ -#define __ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/TypesEx.h" - -namespace arm_compute -{ -class ITensor; - -/** NEON kernel to perform a reduction operation */ -class NEReductionOperationKernelEx : public INEKernel -{ -public: - const char *name() const override { return "NEReductionOperationKernelEx"; } - /** Default constructor */ - NEReductionOperationKernelEx(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEReductionOperationKernelEx(const NEReductionOperationKernelEx &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEReductionOperationKernelEx &operator=(const NEReductionOperationKernelEx &) = delete; - /** Allow instances of this class to be moved */ - NEReductionOperationKernelEx(NEReductionOperationKernelEx &&) = default; - /** Allow instances of this class to be moved */ - NEReductionOperationKernelEx &operator=(NEReductionOperationKernelEx &&) = default; - /** Default destructor */ - ~NEReductionOperationKernelEx() = default; - - /** Set the source, destination of the kernel - * - * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. Data layouts supported: - * NCHW. - * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input. - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 - * @param[in] op Reduction operation to perform. - */ - void configure(const ITensor *input, ITensor *output, unsigned int axis, ReduceOperation op); - - /** Static function to check if given info will lead to a valid configuration of @ref - * NEReductionOperationKernelEx. - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. Data layouts - * supported: NCHW. - * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p - * input. - * Output will have the same number of dimensions as input. - * @param[in] axis Axis along which to reduce. Supported reduction axis : 0 - * @param[in] op Reduction operation to perform. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, - ReduceOperation op); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - BorderSize border_size() const override; - -private: - const ITensor *_input; - ITensor *_output; - unsigned int _reduction_axis; - ReduceOperation _op; - BorderSize _border_size; -}; -} // namespace arm_compute -#endif /*__ARM_COMPUTE_NEREDUCTIONOPERATIONKERNELEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/TypesEx.h b/compute/ARMComputeEx/arm_compute/core/TypesEx.h index faba8a449..cda8a30b1 100644 --- a/compute/ARMComputeEx/arm_compute/core/TypesEx.h +++ b/compute/ARMComputeEx/arm_compute/core/TypesEx.h @@ -51,15 +51,6 @@ enum class ArgOperation MIN, }; -/** Available reduce operations */ -enum class ReduceOperation -{ - MAX, /**< Max */ - MEAN, /**< Mean */ - SUM, /**< Sum */ - MIN, /**< Min */ -}; - /** Available binary logical operations */ enum class BinaryLogicalOperation { diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h index d57e8fcf5..d7ec1b4f0 100644 --- a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -67,5 +67,5 @@ transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_top); -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h index a9ceacbea..2aaab6b3a 100644 --- a/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h +++ b/compute/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h @@ -72,10 +72,10 @@ namespace shape_calculator * @return the calculated shape */ inline TensorShape compute_transposeconv_upsampled_shape( - const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, - std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, - unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, - unsigned int &pad_top, unsigned int &pad_bottom) + const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &info, + std::pair<unsigned int, unsigned int> &out_dims, unsigned int invalid_right, + unsigned int invalid_bottom, unsigned int &pad_left, unsigned int &pad_right, + unsigned int &pad_top, unsigned int &pad_bottom) { unsigned int sx = info.stride().first; unsigned int sy = info.stride().second; @@ -103,7 +103,7 @@ inline TensorShape compute_transposeconv_upsampled_shape( unsigned int padx_all_except_invallid = padx + info.pad_left() + info.pad_right() - invalid_right; unsigned int pady_all_except_invallid = - pady + info.pad_top() + info.pad_bottom() - invalid_bottom; + pady + info.pad_top() + info.pad_bottom() - invalid_bottom; pad_left = (padx_all_except_invallid + 1) / 2 - info.pad_left(); pad_right = pady_all_except_invallid / 2 - info.pad_right() + invalid_right; pad_top = (padx_all_except_invallid + 1) / 2 - info.pad_top(); @@ -135,7 +135,7 @@ compute_transposeconv_output_shape(const std::pair<unsigned int, unsigned int> & const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int height_idx = 
get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 const int channel_idx =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
 TensorShape out_shape{input_shape};
@@ -160,7 +160,7 @@ inline TensorShape compute_depth_to_space_shape_ex(const ITensorInfo *input, int
 const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
 const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 TensorShape output_shape{input->tensor_shape()};
 output_shape.set(idx_width, input->dimension(idx_width) * block);
@@ -238,6 +238,36 @@ inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
 return output_shape;
 }
+/** Calculate the one-hot output shape of a tensor
+ *
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] depth The depth of the one-hot dimension
+ * @param[in] actual_axis The axis to fill
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_onehot_shape_ex(const TensorShape &indices_shape, uint32_t depth,
+ uint32_t actual_axis)
+{
+ ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+ ARM_COMPUTE_ERROR_ON(actual_axis > indices_shape.num_dimensions());
+
+ TensorShape output_shape;
+ output_shape.set(actual_axis, depth);
+
+ unsigned int i_shift = 0;
+ for (unsigned int i = 0; i < indices_shape.num_dimensions(); ++i)
+ {
+ if (i == actual_axis)
+ {
+ i_shift++;
+ }
+ output_shape.set(i + i_shift, indices_shape[i]);
+ }
+
+ return output_shape;
+}
+
 } // namespace shape_calculator
 } // namespace misc
 } // namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index cfbd13436..664b8b3b1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -16,14 +16,19 @@
 #ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
 #define __ARM_COMPUTE_CLFUNCTIONSEX_H__
+#include <arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h>
 #include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
+#include <arm_compute/runtime/CL/functions/CLCastBool.h>
 #include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
 #include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
 #include <arm_compute/runtime/CL/functions/CLGatherEx.h>
 #include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
 #include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
 #include <arm_compute/runtime/CL/functions/CLNeg.h>
+#include <arm_compute/runtime/CL/functions/CLOneHot.h>
+#include <arm_compute/runtime/CL/functions/CLPadLayerEx.h>
 #include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
+#include <arm_compute/runtime/CL/functions/CLSplitVEx.h>
 #include <arm_compute/runtime/CL/functions/CLTopKV2.h>
 #include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
new file mode 100644
index 000000000..05bcc4075
--- /dev/null
+++ 
b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ +#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ + +#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" + +namespace arm_compute +{ +class ITensorInfo; +class ICLTensor; + +/** Function to calculate the index of the minimum or maximum values in a + * tensor based on an axis. + * + * @note The default data type for an uninitialized output tensor is + * signed 32-bit integer (S32). It is the user's responsibility to check + * that the results do not overflow because the indices are computed + * in unsigned 32-bit (U32). + */ +class CLArgMinMaxLayerEx : public IFunction +{ +public: + /** Default Constructor. + * + * @param[in] memory_manager (Optional) Memory manager. + */ + CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input and output tensors. + * + * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[out] output Output source tensor. Data types supported: U32/S32. + * @param[in] op Reduction operation to perform. 
Operations supported: ARG_IDX_MAX, + * ARG_IDX_MIN + */ + void configure(const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLArgMinMaxLayerEx + * + * @param[in] input Input source tensor info. Data types supported: QASYMM8/F16/F32. + * @param[in] axis Axis to find max/min index. + * @param[in] output Output source tensor info. Data types supported: U32/S32. + * @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, + * ARG_IDX_MIN + * + * @return a status + */ + static Status validate(const ITensorInfo *input, int axis, const ITensorInfo *output, + const ReductionOperation &op); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::vector<CLTensor> _results_vector; + CLTensor _not_reshaped_output; + std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector; + CLReshapeLayer _reshape_kernel; + unsigned int _num_of_stages; + unsigned int _reduction_axis; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h index 88a9b00ec..fc4322798 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -43,6 +43,7 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPOneHotEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h index 7930e4e20..854ddce52 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPOneHotEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -38,31 +38,34 @@ * SOFTWARE. */ -#ifndef __ARM_COMPUTE_CPPONEHOT_EX_H__ -#define __ARM_COMPUTE_CPPONEHOT_EX_H__ +/** + * @file CLCastBool.h + * @ingroup COM_AI_RUNTIME + * @brief This file contains arm_compute::CLCastBool class + */ + +#ifndef ARM_COMPUTE_CLCASTBOOL_H +#define ARM_COMPUTE_CLCASTBOOL_H -#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" -#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { -class ITensor; +class ICLTensor; -/** Basic function to run @ref CPPOneHot */ -class CPPOneHotEx : public ICPPSimpleFunction +/** + * @brief Class to run @ref CLCastBoolKernel. + * This converts the boolean input tensor to the output tensor's type. + */ +class CLCastBool : public ICLSimpleFunction { public: - /** Configure the one_hot function - * - * @param[in] indices A tensor for indices. Data types supported: S32 - * @param[in] depth A tensor for depth. Data types supported: S32 - * @param[in] on_value A tensor for on_value. Data types supported: F32 - * @param[in] off_value A tensor for off_value. Data types supported: F32 - * @param[out] output A tensor for computed value of one hot operator - * @param[in] axis An int value for axis + /** + * @brief Initialise the kernel's input and output + * @param[in] input Input tensor. 
Data types supported: U8 + * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/F16/F32. */ - void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, - const ITensor *off_value, ITensor *output, const int axis); + void configure(ICLTensor *input, ICLTensor *output); }; -} -#endif /* __ARM_COMPUTE_CPPONEHOT_EX_H__ */ +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLCASTBOOL_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h index 409eaf593..026209f69 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h @@ -106,22 +106,24 @@ public: CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default; /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this - * is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[out] output Output tensor. + * The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. * */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -130,23 +132,24 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 
3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. * @param[in] bias (Optional) The biases have one dimension. * Data type supported: Should match @p input data type, except for - * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type + * input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * the @p input. * @param[in] info Contains padding and policies to be used in the deconvolution, - * this is decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -154,24 +157,26 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLDirectTransposeConvLayer + * CLDirectTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. - * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. - * Data type supported: Should match @p input data type, except for input - * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. 
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Should match @p input data type, + * except for input of QASYMM8 and QASYMM8_SIGNED type + * where biases should be of S32 type + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped + * with @ref CLWeightsReshapeKernel. * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h index fbee7e40e..b0149cb09 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h @@ -73,5 +73,5 @@ public: */ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h index f3266f688..c75ae9a50 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -43,14 +43,14 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" #include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" #include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" -#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" namespace arm_compute { @@ -182,5 +182,5 @@ private: bool _is_prepared; const ICLTensor *_original_weights; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index e65a646dc..c08da526a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -43,16 +43,14 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include 
"arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" namespace arm_compute { @@ -132,9 +130,6 @@ private: * transpose_weights is set to true ) (called once) * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized * asymmetric) - * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref - * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is - * not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. */ @@ -157,40 +152,36 @@ public: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor. The weights must be 2 dimensional. * If this function is called after a Convolution Layer, the (transposed) - * weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) - * weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. + * weights will have as many rows as the product of the first 3 input's dimensions. If it is + * called after another FullyConnected Layer, the (transposed) weights will have as many rows as + * the input's first dimension. Data type supported: Same as @p input. * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix * multiplication between: * - The output of im2col on the input and the (transposed) 2D weights, if the * function is called after a Convolution Layer * - The input tensor and the (transposed) 2D weights, if the function is - * called after another FullyConnected Layer. - * Data type supported: Same as @p input. + * called after another FullyConnected Layer. Data type supported: Same as @p input. * @param[in] fc_info (Optional) Fully connected layer additional info */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLFullyConnectedLayerEx + * CLFullyConnectedLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor info. The weights must be 2 dimensional. * If this function is called after a Convolution Layer, the (transposed) - * weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) - * weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. + * weights will have as many rows as the product of the first 3 input's dimensions. If it is + * called after another FullyConnected Layer, the (transposed) weights will have as many rows as + * the input's first dimension. Data type supported: Same as @p input. * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. 
* @param[out] output Destination tensor info. Its shape should be equal to the output of a * matrix multiplication between: * - The output of im2col on the input and the (transposed) 2D weights, if the * function is called after a Convolution Layer * - The input tensor and the (transposed) 2D weights, if the function is - * called after another FullyConnected Layer. - * Data type supported: Same as @p input. + * called after another FullyConnected Layer. Data type supported: Same as @p input. * @param[in] fc_info (Optional) Fully connected layer additional info * * @return a status @@ -216,7 +207,7 @@ private: CLConvertFullyConnectedWeights _convert_weights; weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed; weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged - _reshape_weights_managed_function; + _reshape_weights_managed_function; CLFlattenLayer _flatten_layer; CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; CLGEMM _mm_gemm; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h index 289ab167f..bdb168664 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: CLFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) - : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, - _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) + : _input(nullptr), _weights(nullptr), _biases(nullptr), _output(nullptr), _cl_buffer{}, + _memory_manager{memory_manager}, _cl_fc{nullptr}, _cl_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h index b01ec4255..385eb0b2c 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -47,11 +47,14 @@ #ifndef __ARM_COMPUTE_CLGATHEREX_H__ #define __ARM_COMPUTE_CLGATHEREX_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** * @brief Class to to run @ref CLGatherKernel. @@ -66,7 +69,7 @@ public: * @param[out] output The output tensor, Data types supported: same as @p input. * @param[in] axis (Optional) The axis in @p input to gather @p indices from. 
Defaults to 0 * @return N/A - */ + */ void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0); /** @@ -81,5 +84,5 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h index 6618f5aa4..5e172a4c7 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -78,5 +78,5 @@ public: void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, ICLTensor *output, ICLTensor *hits); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h index 887e7aaa5..02ae6d719 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h @@ -41,11 +41,14 @@ #ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ #define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to perform a Instance normalization. * diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h new file mode 100644 index 000000000..62a36f06d --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLONEHOT_H__ +#define __ARM_COMPUTE_CLONEHOT_H__ + +#include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; +/** Basic function to run @ref CLOneHotKernel */ +class CLOneHot : public IFunction +{ +public: + /** Constructor */ + CLOneHot(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHot(const CLOneHot &) = delete; + /** Default move constructor */ + CLOneHot(CLOneHot &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLOneHot &operator=(const CLOneHot &) = delete; + /** Default move assignment operator */ + CLOneHot &operator=(CLOneHot &&) = default; + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, const ICLTensor *off_value, + ICLTensor *output, int depth, int axis = -1); + /** Initialise the kernel's inputs and outputs with off_value being constant + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] off_value The PixelValue for off value. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + */ + void configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + PixelValue off_value, int depth, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLOneHotKernel + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. 
Data type supported: + * Same as @p on_value + * @param[in] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] depth The depth of the one hot dimension. + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis = -1); + + // Inherited methods overridden: + void run() override; + +private: + CLMemsetKernel _memset_kernel; /**< Memset kernel */ + CLOneHotKernel _onehot_kernel; /**< OneHot kernel */ + bool _has_to_memset; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLONEHOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h new file mode 100644 index 000000000..ee1879aaa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLPADLAYEREX_H +#define ARM_COMPUTE_CLPADLAYEREX_H + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" +#include "src/core/gpu/cl/kernels/ClCopyKernel.h" +// #include "arm_compute/runtime/CL/functions/CLCopy.h" +#include <memory> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to pad a tensor. 
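A hypothetical configure/run sequence for the CLOneHot function declared above. The indices length, depth value and output shape are made-up values and assume the same CLScheduler/CLTensor setup as the earlier CLGatherEx sketch; the second overload that takes a PixelValue off_value is used analogously:

    #include "arm_compute/runtime/CL/functions/CLOneHot.h"

    CLTensor indices, on_value, off_value, output;
    indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
    on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
    off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
    // With depth = 10 and axis = -1 the one-hot dimension becomes the innermost
    // dimension; the exact output shape follows the usual OneHot rules.
    output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

    CLOneHot one_hot;
    one_hot.configure(&indices, &on_value, &off_value, &output, /*depth=*/10, /*axis=*/-1);

    for (auto t : {&indices, &on_value, &off_value, &output})
        t->allocator()->allocate();
    // ... fill indices (values in [0, depth)), on_value and off_value ...
    one_hot.run();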
This function calls the following OpenCL functions/kernels: + * + * -# @ref CLPadLayerKernelEx if there is padding to be added + * -# @ref CLCopyKernel otherwise + */ +class CLPadLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLPadLayerEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerEx(const CLPadLayerEx &) = delete; + /** Default move constructor */ + CLPadLayerEx(CLPadLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerEx &operator=(const CLPadLayerEx &) = delete; + /** Default move assignment operator */ + CLPadLayerEx &operator=(CLPadLayerEx &&) = default; + + /** Initialize the function + * + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The + * pair padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPadLayerEx. + * + * @param[in] input Source tensor info. Data types supported: All. + * @param[in] output Output tensor info. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). 
+ */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + // Inherited methods overridden: + void run() override; + +private: + void configure_reflect_mode(ICLTensor *input, ICLTensor *output); + + std::unique_ptr<CLPadLayerKernelEx> _pad_kernel; + std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel; + bool _perform_pad; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLPADLAYEREX_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h index 7dba84b12..45eb72bef 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -48,7 +48,7 @@ #define __ARM_COMPUTE_CLREDUCEOPERATION_H__ #include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" -#include "arm_compute/core/TypesEx.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -82,7 +82,7 @@ public: * @return N/A */ void configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, - bool keep_dims, ReduceOperation op); + bool keep_dims, ReductionOperation op); /** * @brief Static function to check if given info will lead to a valid configuration of @ref @@ -96,7 +96,8 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const std::set<uint32_t> &axis, bool keep_dims, const ReduceOperation &op); + const std::set<uint32_t> &axis, bool keep_dims, + const ReductionOperation &op); /** * @brief Run the OpenCL kernel for this operation @@ -115,5 +116,5 @@ private: std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; CLReshapeLayer _reshape; }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h new file mode 100644 index 000000000..3023df3f0 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2018 ARM Limited. 
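A minimal sketch of how CLPadLayerEx, declared above, might be configured for constant padding. The 34x34x3 output shape simply reflects one element of padding on each side of a 32x32x3 input and, like the tensor names, is an assumption of this example:

    #include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(34U, 34U, 3U), 1, DataType::F32));

    // Pad one element at the front and back of dimensions 0 and 1.
    PaddingList padding = {{1, 1}, {1, 1}};

    CLPadLayerEx pad;
    pad.configure(&input, &output, padding); // defaults: PixelValue() with PaddingMode::CONSTANT

    input.allocator()->allocate();
    output.allocator()->allocate();
    pad.run();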
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLSPLITVEX__ +#define __ARM_COMPUTE_CLSPLITVEX__ + +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/CL/functions/CLSlice.h" +#include "arm_compute/core/Types.h" +#include <vector> +#include <memory> + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/runtime/CPP/functions/CPPSplit.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLSplitVKernel */ +class CLSplitVEx : public IFunction +{ +public: + /** Default constructor */ + CLSplitVEx(); + /** Configure the split CL kernel + * + * @param[in] input The input tensor to split. Data types supported: + * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] size_splits A 1-D tensor containing the number of tensor values per split + * @param[out] outputs A vector containing the output tensor. Data types supported: Same as @p + * input + * The output tensors should match the input tensor dimensions for all + * shape dimensions apart + * from the split dimension. 
+ * @param[in] split_dim Integer value representing the input tensor dimension along which to + * split + * @param[in] num_splits Number of splits + */ + void configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim, + const std::vector<ICLTensor *> &outputs, unsigned int num_splits); + + void run() override; + +private: + const ICLTensor *_input; + const ICLTensor *_size_splits; + std::vector<ICLTensor *> _outputs; + unsigned int _num_splits; + std::vector<CLSlice> _slice_functions; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLSPLITVEX__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h index e301a5152..f426a4d75 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -160,5 +160,5 @@ private: CLTopKV2Store _store_kernel; #endif }; -} +} // namespace arm_compute #endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h index 5fb102e47..5b27d362a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h @@ -63,20 +63,22 @@ public: /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same - * as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this - * is described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. 
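As a rough illustration of the CLSplitVEx interface above, the sketch below splits an 8x6 tensor into pieces of 2 and 4 along dimension 1. The shapes and the {2, 4} split sizes are invented for the example, and depending on the implementation the size_splits tensor may need to be allocated and filled before configure() is called:

    #include "arm_compute/runtime/CL/functions/CLSplitVEx.h"

    CLTensor input, size_splits, out0, out1;
    input.allocator()->init(TensorInfo(TensorShape(8U, 6U), 1, DataType::F32));
    size_splits.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::S32)); // holds {2, 4}
    out0.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::F32));
    out1.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));

    std::vector<ICLTensor *> outputs = {&out0, &out1};

    CLSplitVEx splitv;
    splitv.configure(&input, &size_splits, /*split_dim=*/1, outputs, /*num_splits=*/2);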
* */ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, @@ -85,22 +87,22 @@ public: /** Set the input, weights, biases and output tensors. * * @param[in] compile_context The compile context to be used. - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and - * an optional 4th dimension for batch of inputs. Data types supported: - * QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: - * Same as @p input. - * @param[out] output Output tensor. The output has the same number of dimensions as - * the @p input. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions + * as the @p input. * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, - * this is described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. - * @param[in] weights_info (Optional) Weights information needed for @ref - * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref - * CLWeightsReshapeKernel. + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for + * @ref CLConvolutionLayer, specifies if the weights tensor has + * been reshaped with @ref CLWeightsReshapeKernel. * */ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, @@ -108,22 +110,24 @@ public: unsigned int invalid_right, unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLTransposeConvLayer + * CLTransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data - * type supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as - * @p input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the - * @p input. - * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is - * described in @ref PadStrideInfo. - * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, - * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data type supported: Same as @p input. + * @param[in] output Output tensor info. The output has the same number of dimensions + * as the @p input. + * @param[in] deconv_info Contains padding and policies to be used in the deconvolution, + * this is described in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, + * specifies if the weights tensor has been reshaped with + * @ref CLWeightsReshapeKernel. * * @return a status */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h index 3fad230f1..d0ddc2609 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -16,13 +16,13 @@ #ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ #define __ARM_COMPUTE_NEFUNCTIONSEX_H__ -#include <arm_compute/runtime/NEON/functions/NEActivationLayerEx.h> -#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> +#include <arm_compute/runtime/NEON/functions/NECastBool.h> #include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> #include <arm_compute/runtime/NEON/functions/NEGatherEx.h> #include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h> #include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h> +#include <arm_compute/runtime/NEON/functions/NEOneHot.h> #include <arm_compute/runtime/NEON/functions/NEReduceSum.h> #include <arm_compute/runtime/NEON/functions/NEReduceOperation.h> #include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h> diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h deleted file mode 100644 index 6156c84f8..000000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEActivationLayerEx.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. 
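A sketch of the CLTransposeConvLayer configure call documented above, using made-up shapes (a 3x3 kernel, stride 2, no extra invalid borders); the 17x17 output extent is only indicative of a stride-2 deconvolution without padding:

    #include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"

    CLTensor input, weights, bias, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32)); // [width, height, IFM, OFM]
    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(17U, 17U, 32U), 1, DataType::F32));

    CLTransposeConvLayer deconv;
    deconv.configure(&input, &weights, &bias, &output,
                     PadStrideInfo(2, 2, 0, 0), /*invalid_right=*/0, /*invalid_bottom=*/0);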
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__ -#define __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__ - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -#include "arm_compute/core/Types.h" - -namespace arm_compute -{ -// Forward declarations -class ITensor; - -/** Basic function to run @ref NEActivationLayerKernelEx - * - * @note The function simulates an activation layer with the specified activation function. - */ -class NEActivationLayerEx : public INESimpleFunctionNoBorder -{ -public: - /** Constructor - * - * @param[in] ctx Runtime context to be used by the function - */ - NEActivationLayerEx(IRuntimeContext *ctx = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEActivationLayerEx(const NEActivationLayerEx &) = delete; - /** Default move constructor */ - NEActivationLayerEx(NEActivationLayerEx &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEActivationLayerEx &operator=(const NEActivationLayerEx &) = delete; - /** Default move assignment operator */ - NEActivationLayerEx &operator=(NEActivationLayerEx &&) = default; - /** [NEActivationLayerEx snippet] **/ - /** Set the input and output tensor. - * - * @note If the output tensor is a nullptr or is equal to the input, the activation function will - * be performed in-place - * - * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this - * tensor will store the result - * of the activation function. Data types supported: - * QASYMM8/QSYMM16/F16/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input - * @param[in] activation_info Activation layer parameters. - */ - void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info); - /** [NEActivationLayerEx snippet] **/ - /** Static function to check if given info will lead to a valid configuration of @ref - * NEActivationLayerEx - * - * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor - * will store the result - * of the activation function. Data types supported: QASYMM8/QSYMM16/F16/F32. - * @param[in] output Destination tensor info. Data type supported: same as @p input - * @param[in] act_info Activation layer information. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfo &act_info); -}; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEACTIVATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h deleted file mode 100644 index 026d30098..000000000 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ -#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ - -#include "arm_compute/core/TypesEx.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" - -namespace arm_compute -{ -class ITensor; - -/** Basic function to run @ref NEBinaryLogicalOperationKernel. - * - * @note The tensor data type for the inputs must be QASYMM8/U8. - * @note The function performs a binary logical operation between two tensors. - */ -class NEBinaryLogicalOperation : public INESimpleFunction -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8. - * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[out] output Output tensor. Data types supported: Same as @p input1. - * @param[in] op Binary Logical Operation to be performed. 
- */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEBinaryLogicalOperationKernel - * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * @param[in] op Binary Logical Operation to be performed. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, BinaryLogicalOperation op); -}; - -/** Basic function to run @ref NEBinaryLogicalOperationKernel - * - * @note The tensor data type for the inputs must be QASYMM8/U8. - * @note The function performs a binary logical operation between two tensors. - */ -template <BinaryLogicalOperation op> class NEBinaryLogicalOperationStatic : public INESimpleFunction -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/U8 - * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. - * @param[out] output Output tensor. Data types supported: Same as @p input1. - */ - void configure(ITensor *input1, ITensor *input2, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEBinaryLogicalOperationKernel - * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/U8 - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output); -}; - -/** Basic function to run equal comparison. */ -using NELogicalAnd = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; -/** Basic function to run not equal comparison. */ -using NELogicalOr = NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; -} // namespace arm_compute -#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h new file mode 100644 index 000000000..dd62645ee --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECASTBOOL_H__ +#define __ARM_COMPUTE_NECASTBOOL_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" + +namespace arm_compute +{ +class ITensor; +class ITensorInfo; + +/** + * @brief Class to run @ref INESimpleFunctionNoBorder. + */ +class NECastBool : public INESimpleFunctionNoBorder +{ +public: + /** Initialize the function's source, destination + * + * Valid conversions Input -> Output : + * + * - U8 -> U8, S8, U16, S16, U32, S32, F32, F16 + * + * @param[in] input The input tensor to convert. Data types supported: U8 + * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + */ + void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NECastBool + * + * @param[in] input Source tensor info. Data types supported: U8. + * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NECASTBOOL_H__*/ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h index 63f7714aa..82a789e86 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -48,12 +48,14 @@ #define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Error.h" #include <vector> namespace arm_compute { class ITensor; +class ITensorInfo; /** * @brief Class to perform EmbeddingLookup operation @@ -84,5 +86,5 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *lookups); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h index 56548a479..214592710 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -44,11 +44,11 @@ #include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h index 8f98f220a..2bbb1fea1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -43,16 +43,16 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" -#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" namespace arm_compute { @@ -79,11 +79,11 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ 
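A minimal NEON-side sketch of the NECastBool function declared above, converting a U8 boolean tensor to F32; the tensor length and target data type are arbitrary choices for illustration:

    #include "arm_compute/runtime/Tensor.h"
    #include "arm_compute/runtime/NEON/functions/NECastBool.h"

    using namespace arm_compute;

    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8)); // boolean values stored as U8
    output.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    NECastBool cast_bool;
    ARM_COMPUTE_ERROR_THROW_ON(NECastBool::validate(input.info(), output.info()));
    cast_bool.configure(&input, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill input with 0/1 values ...
    cast_bool.run();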
NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; /** Default move constructor */ - NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; /** Default move assignment operator */ - NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = delete; /** Set the input and output tensors. * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. @@ -141,7 +141,7 @@ private: void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); MemoryGroup _memory_group; - NEFlattenLayerKernel _flatten_kernel; + NEFlattenLayer _flatten_kernel; NEConvertFullyConnectedWeights _convert_weights; NEFullyConnectedLayerReshapeWeights _reshape_weights_function; NEGEMM _mm_gemm; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h index 18cb61bf9..e34b4dcb0 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h @@ -43,8 +43,8 @@ public: public: NEFullyConnectedReshapingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr) - : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), - _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) + : _memory_manager{memory_manager}, _input(nullptr), _weights(nullptr), _biases(nullptr), + _output(nullptr), _neon_buffer{}, _neon_fc{nullptr}, _neon_reshape{}, _needs_reshape(false) { // DO NOTHING } diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h index 155a1b837..6944c77f6 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h @@ -47,6 +47,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEGatherKernelEx */ class NEGatherEx : public INESimpleFunctionNoBorder diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h index 521a05ad9..f6fda60a9 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -48,12 +48,14 @@ #define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Error.h" #include <vector> namespace arm_compute { class ITensor; +class ITensorInfo; /** * @brief Class to perform HashtableLookup operation @@ -96,5 +98,5 @@ public: const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *hits); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h index 
18e813923..0ee967698 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -54,6 +54,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to perform a Instance normalization. * @@ -112,5 +113,5 @@ private: Tensor _permuted_input; Tensor _permuted_output; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h new file mode 100644 index 000000000..668f024a1 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEONEHOT_H__ +#define __ARM_COMPUTE_NEONEHOT_H__ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +namespace arm_compute +{ +// Forward declarations +class ITensor; +class ITensorInfo; + +/** Basic function to run @ref NEOneHotKernel */ +class NEOneHot : public INESimpleFunctionNoBorder +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the + * following types: U32/S32 + * @param[in] depth The tensor for depth of the one hot dimension. Supported tensor rank: up + * to 3. Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor. Supported tensor rank: only 1. Data type supported: + * U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor. Supported tensor rank: only 1. 
Data type supported: + * Same as @p on_value + * @param[out] output Destination tensor. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + */ + void configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, + const ITensor *off_value, ITensor *output, int axis = -1); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEOneHotKernel + * + * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] depth The tensor info for depth of the one hot dimension. + * Supported tensor rank: up to 3. + * Must be one of the following types: U32/S32 + * @param[in] on_value On value tensor info. Supported tensor rank: only 1. + * Data type supported: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] off_value Off value tensor info. Supported tensor rank: only 1. + * Data type supported: Same as @p on_value + * @param[out] output Destination tensor info. Data type supported: Same as @p on_value + * @param[in] axis (Optional) The axis to fill. Negative values wrap around. Defaults to -1. + * The value must be in range [-indices.rank , indices.rank) + * + * @return a status + */ + static Status validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis = -1); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEONEHOT_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h index 7f764b000..9858e6c09 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -43,10 +43,10 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/TypesEx.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h" +#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -71,7 +71,7 @@ public: * @param[in] op Reduce operation to perform. 
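Unlike the CL version sketched earlier, NEOneHot takes the depth as a tensor rather than an integer. A rough usage sketch under the same illustrative assumptions (indices length 4, depth 10, axis -1); the output shape shown follows the usual OneHot layout but is not prescribed by the header:

    #include "arm_compute/runtime/NEON/functions/NEOneHot.h"

    Tensor indices, depth, on_value, off_value, output;
    indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
    depth.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32));   // holds the depth value, e.g. 10
    on_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
    off_value.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));

    NEOneHot one_hot;
    one_hot.configure(&indices, &depth, &on_value, &off_value, &output, /*axis=*/-1);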
*/ void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output, - ReduceOperation op); + ReductionOperation op); /** Static function to check if given info will lead to a valid configuration of @ref * NEReduceOperation @@ -85,14 +85,14 @@ public: * @return A status */ static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, - bool keep_dims, const ITensorInfo *output, ReduceOperation op); + bool keep_dims, const ITensorInfo *output, ReductionOperation op); // Inherited methods overridden: void run() override; private: MemoryGroup _memory_group; - std::vector<NEReductionOperationEx> _reduction_kernels; + std::vector<NEReductionOperation> _reduction_kernels; std::vector<Tensor> _reduced_outs; NEReshapeLayer _reshape; unsigned int _reduction_ops; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h index 48b416923..f34a8f8af 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -43,11 +43,13 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +#include "arm_compute/runtime/Tensor.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 24ff5dac9..f82579a45 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -102,47 +102,50 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete; /** Allow instances of this class to be moved */ - NETransposeConvLayer(NETransposeConvLayer &&) = default; + NETransposeConvLayer(NETransposeConvLayer &&) = delete; /** Allow instances of this class to be moved */ - NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default; + NETransposeConvLayer &operator=(NETransposeConvLayer &&) = delete; /** Default destructor */ virtual ~NETransposeConvLayer() = default; /** Set the input, weights, biases and output tensors. * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type - * supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 - * for F16 input. - * @param[out] output Output tensor. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. 
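With the switch to the standard ReductionOperation type shown above, a NEReduceOperation call might look like the following sketch; the reduction axis, shapes and the choice of SUM are illustrative only:

    #include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"

    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32)); // axis 0 reduced, keep_dims = false

    Coordinates reduction_axis{0}; // reduce over dimension 0

    NEReduceOperation reduce;
    reduce.configure(&input, reduction_axis, /*keep_dims=*/false, &output, ReductionOperation::SUM);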
- * @param[in] invalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias Optional, ignored if NULL. The biases have one dimension. + * Data type supported: Data types supported: S32 for QASYMM8 and + * QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. + * @param[out] output Output tensor. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] invalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. * */ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom); /** Static function to check if given info will lead to a valid configuration of @ref - * NETransposeConvLayer + * NETransposeConvLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an - * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type - * supported: Same as @p input. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types - * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the @p - * input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is - * decribed in @ref PadStrideInfo. - * @param[in] innvalid_right The number of zeros added to right edge of the output. - * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, + * and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input. + * @param[in] bias (Optional) The biases have one dimension. + * Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, + * F32 for F32 input, F16 for F16 input. + * @param[in] output Output tensor info. The output has the same number of dimensions as + * the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, + * this is decribed in @ref PadStrideInfo. + * @param[in] innvalid_right The number of zeros added to right edge of the output. + * @param[in] invalid_bottom The number of zeros added to bottom edge of the output. 
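The validate() entry point documented above can be used to check a transpose convolution configuration before any memory is allocated. A hedged sketch, reusing the placeholder shapes from the CL example earlier:

    #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"

    TensorInfo input_info(TensorShape(8U, 8U, 16U), 1, DataType::F32);
    TensorInfo weights_info(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32);
    TensorInfo bias_info(TensorShape(32U), 1, DataType::F32);
    TensorInfo output_info(TensorShape(17U, 17U, 32U), 1, DataType::F32);

    const Status status = NETransposeConvLayer::validate(&input_info, &weights_info, &bias_info,
                                                         &output_info, PadStrideInfo(2, 2, 0, 0),
                                                         /*invalid_right=*/0, /*invalid_bottom=*/0);
    ARM_COMPUTE_ERROR_THROW_ON(status); // or inspect status.error_description() on failure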
* * @return a status */ @@ -168,5 +171,5 @@ private: PadStrideInfo _info; bool _is_prepared; }; -} // arm_compute +} // namespace arm_compute #endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index ba42a2456..e15dc2685 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -54,104 +54,143 @@ using namespace arm_compute; const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { - // ARMComputeEx kernels - {"binary_logical_op", "binary_logical_op.cl"}, - {"embedding_lookup", "embedding_lookup.cl"}, - {"gather_ex", "gather_ex.cl"}, - {"gather_ex_1d", "gather_ex.cl"}, - {"gather_ex_1d_out", "gather_ex.cl"}, - {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, - {"hashtable_lookup", "hashtable_lookup.cl"}, - {"instance_normalization_ex", "instance_normalization_ex.cl"}, - {"multiply_scale_factor", "multiply_scale_factor.cl"}, - {"neg_tensor", "neg_tensor.cl"}, - {"quantization_symm8", "quantization_symm8.cl"}, - {"reduce_min_max", "reduce_operation.cl"}, - {"reduce_sum_mean", "reduce_operation.cl"}, - {"topkv2_init", "topkv2.cl"}, - {"topkv2_find_first_negative", "topkv2.cl"}, - {"topkv2_reorder_negatives", "topkv2.cl"}, - {"topkv2_store", "topkv2.cl"}, - {"radixsort_histogram", "topkv2_radixsort.cl"}, - {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, - {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, - {"radixsort_reorder", "topkv2_radixsort.cl"}, - {"topkv2_quicksort", "topkv2_quicksort.cl"}, - {"scale_factor_symm8", "scale_factor.cl"}, + // ARMComputeEx kernels + {"arg_min_max_ex_x", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_y", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_z", "arg_min_max_ex.cl"}, + {"arg_min_max_ex_w", "arg_min_max_ex.cl"}, + {"binary_logical_op", "binary_logical_op.cl"}, + {"cast_bool", "cast.cl"}, + {"embedding_lookup", "embedding_lookup.cl"}, + {"gather_ex", "gather_ex.cl"}, + {"gather_ex_1d", "gather_ex.cl"}, + {"gather_ex_1d_out", "gather_ex.cl"}, + {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"gemm_accumulate_biases", "gemm.cl"}, + {"hashtable_lookup", "hashtable_lookup.cl"}, + {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"memset", "memset.cl"}, + {"multiply_scale_factor", "multiply_scale_factor.cl"}, + {"neg_tensor", "neg_tensor.cl"}, + {"one_hot", "one_hot.cl"}, + {"one_hot_only_on_value", "one_hot.cl"}, + {"pad_layer_constant", "pad_layer.cl"}, + {"pad_layer_symmetric_reflect", "pad_layer.cl"}, + {"quantization_symm8", "quantization_symm8.cl"}, + {"reduce_min_max", "reduce_operation.cl"}, + {"reduce_sum_mean", "reduce_operation.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, + {"scale_factor_symm8", "scale_factor.cl"}, }; const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS - { - "embedding_lookup.cl", + { + "activation_float_helpers.h", +#include "./cl_kernels/activation_float_helpers.hembed" + }, + { + "arg_min_max_ex.cl", +#include "./cl_kernels/arg_min_max_ex.clembed" + }, + { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { + "cast.cl", +#include "./cl_kernels/cast.clembed" + }, + { + "embedding_lookup.cl", #include "./cl_kernels/embedding_lookup.clembed" - }, - { - 
"gather_ex.cl", + }, + { + "gather_ex.cl", #include "./cl_kernels/gather_ex.clembed" - }, - { - "gemmlowp_ex.cl", + }, + { + "gemmlowp_ex.cl", #include "./cl_kernels/gemmlowp_ex.clembed" - }, - { - "hashtable_lookup.cl", + }, + { + "gemm_helpers.h", +#include "./cl_kernels/gemm_helpers.hembed" + }, + { + "hashtable_lookup.cl", #include "./cl_kernels/hashtable_lookup.clembed" - }, - { - "helpers.h", + }, + { + "helpers.h", #include "./cl_kernels/helpers.hembed" - }, - { - "helpers_asymm.h", + }, + { + "helpers_asymm.h", #include "./cl_kernels/helpers_asymm.hembed" - }, - { - "instance_normalization_ex.cl", + }, + { + "instance_normalization_ex.cl", #include "./cl_kernels/instance_normalization_ex.clembed" - }, - { - "binary_logical_op.cl", -#include "./cl_kernels/binary_logical_op.clembed" - }, - { - "multiply_scale_factor.cl", + }, + { + "gemm.cl", +#include "./cl_kernels/gemm.clembed" + }, + { + "memset.cl", +#include "./cl_kernels/memset.clembed" + }, + { + "multiply_scale_factor.cl", #include "./cl_kernels/multiply_scale_factor.clembed" - }, - { - "neg_tensor.cl", + }, + { + "neg_tensor.cl", #include "./cl_kernels/neg_tensor.clembed" - }, - { - "quantization_symm8.cl", + }, + { + "one_hot.cl", +#include "./cl_kernels/one_hot.clembed" + }, + { + "pad_layer.cl", +#include "./cl_kernels/pad_layer.clembed" + }, + { + "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" - }, - { - "reduce_operation.cl", + }, + { + "reduce_operation.cl", #include "./cl_kernels/reduce_operation.clembed" - }, - { - "scale_factor.cl", + }, + { + "repeat.h", +#include "./cl_kernels/repeat.hembed" + }, + { + "scale_factor.cl", #include "./cl_kernels/scale_factor.clembed" - }, - { - "topkv2.cl", + }, + { + "topkv2.cl", #include "./cl_kernels/topkv2.clembed" - }, - { - "topkv2_radixsort.cl", -#include "./cl_kernels/topkv2_radixsort.clembed" - }, - { - "topkv2_quicksort.cl", + }, + { + "topkv2_quicksort.cl", #include "./cl_kernels/topkv2_quicksort.clembed" - }, + }, #endif /* EMBEDDED_KERNELS */ }; CLKernelLibraryEx::CLKernelLibraryEx() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the // CLKernelLibraryEx is built @@ -318,8 +357,8 @@ size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) con size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); ARM_COMPUTE_ERROR_ON_MSG( - err != 0, - "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); ARM_COMPUTE_UNUSED(err); return result; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h new file mode 100644 index 000000000..3c3ff8419 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if GPU_ARCH == GPU_ARCH_BIFROST +#define MLA(a, b, c) (fma(c, b, a)) +#else // GPU_ARCH == GPU_ARCH_BIFROST +#define MLA(a, b, c) ((b) * (c) + (a)) +#endif // GPU_ARCH == GPU_ARCH_BIFROST + +// Hard-Swish +#define hard_swish_op(DATA_TYPE, x, A_VAL, B_VAL) \ + (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) + +// Logistic Activation +#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) + +// Hyperbolic Tangent Activation +#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) + +// RELU Tangent Activation +#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) + +// Bounded RELU Activation +#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) + +// Lower Upper Bounded RELU Activation +#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) + +// Leaky RELU Activation +#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) \ + ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) + +// Soft RELU Activation +#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) + +// ELU Activation +#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) \ + (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0))) + +// Absolute Activation +#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x)) + +// Square Activation +#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x) + +// Square-root Activation +#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x)) + +// Linear Activation +#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) + +// Identity Activation +#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x) + +#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL) 
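// Note on the dispatch pattern: ACT_OP above, together with the ACTIVATION wrapper
// defined immediately after it, picks the activation at compile time by token-pasting
// the operation name onto "_op", so the host selects the function purely through the
// -D build option it passes when compiling the kernel. A minimal illustrative
// expansion (the argument values below are hypothetical, not taken from this patch):
//   ACTIVATION(lu_brelu, float, x, 6.0f, 0.0f)
//     -> ACT_OP(lu_brelu, float, x, 6.0f, 0.0f)
//     -> lu_brelu_op(float, x, 6.0f, 0.0f)
//     -> (min(max(x, (float)0.0f), (float)6.0f))   // clamps x into [0, 6]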
+ +#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl new file mode 100644 index 000000000..135cacf59 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(FLOAT_DATA_TYPE) +#define ISGREATER(x, y) isgreater(x, y) +#define ISLESS(x, y) isless(x, y) +#else // !FLOAT_DATA_TYPE +#if defined(WIDTH) +#define ISGREATER(x, y) (x > y) ? 1 : 0 +#define ISLESS(x, y) (x < y) ? 1 : 0 +#else // !defined(WIDTH) +#define ISGREATER(x, y) \ + select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y) +#define ISLESS(x, y) \ + select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y) +#endif // defined(WIDTH) +#endif // defined(FLOAT_DATA_TYPE) + +#if defined(ARG_MAX) +#define CONDITION_TO_USE(x, y) ISGREATER(x, y) +#elif defined(ARG_MIN) +#define CONDITION_TO_USE(x, y) ISLESS(x, y) +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) +#error "Unsupported reduction operation!" +#endif // defined(ARG_MAX) + +#if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) +#if defined(WIDTH) +#if defined(ARG_MIN) +#if defined(PREV_OUTPUT) +/** Find index minimum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. 
+ */ +inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, + __global const DATA_TYPE_OUTPUT *prev_res, + const int x_idx) +{ + int end_elem = (x_idx + 1) * 16; + if (end_elem > WIDTH) + { + end_elem = WIDTH - x_idx * 16; + } + DATA_TYPE_OUTPUT res = prev_res[0]; + for (int x_v = 1; x_v < end_elem; ++x_v) + { + res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res)); + } + return res; +} +#else // !defined(PREV_OUTPUT) +/** Find index minimum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. + */ +inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx) +{ +#if WIDTH < 16 + DATA_TYPE_OUTPUT res = 0; + for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) + { + res = select(res, x_v, *(input + x_v) < *(input + res)); + } + return res; +#else // WIDTH >= 16 + int x_elem = x_idx * 16; + const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); + x_elem -= x_goback; + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input - x_goback); + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) + idx_sel = (in.s01234567 <= in.s89abcdef); + in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); + res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); + + idx_sel.s0123 = + (in.s0123 < in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); + res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); + + idx_sel.s01 = + (in.s01 < in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + in.s01 = select(in.s23, in.s01, idx_sel.s01); + res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); + + idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); + res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); + + return res.s0 + x_elem; +#endif // WIDTH < 16 +} +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MIN) +#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) +/** Find index maximum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. + */ +inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, + __global const DATA_TYPE_OUTPUT *prev_res, + const int x_idx) +{ + int end_elem = (x_idx + 1) * 16; + if (end_elem > WIDTH) + { + end_elem = WIDTH - x_idx * 16; + } + DATA_TYPE_OUTPUT res = prev_res[0]; + unsigned int res_int = res; + DATA_TYPE_OUTPUT condition_check2; + for (int x_v = 1; x_v < end_elem; ++x_v) + { + int i1 = prev_res[x_v]; + condition_check2 = *(input + i1) > *(input + res_int); + res = select(res, prev_res[x_v], condition_check2); + } + return res; +} +#else // !defined(PREV_OUTPUT) +/** Find index maximum value of a vector + * + * @param[in] input Pointer to the first value. + * + * @return index of the vector. 
+ */ +inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx) +{ +#if WIDTH < 16 + DATA_TYPE_OUTPUT res = 0; + unsigned int i1; + unsigned int i2; + DATA_TYPE_OUTPUT condition_check; + for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) + { + i1 = x_v; + i2 = res; + condition_check = *(input + i1) > *(input + i2); + res = select(res, x_v, condition_check); + } + return res; +#else // WIDTH >= 16 + int x_elem = x_idx * 16; + const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); + x_elem -= x_goback; + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input - x_goback); + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + VEC_DATA_TYPE(DATA_TYPE_SELECT, 8) + idx_sel = (in.s01234567 >= in.s89abcdef); + in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); + res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); + + idx_sel.s0123 = + (in.s0123 > in.s4567) || + (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4))); + in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); + res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); + + idx_sel.s01 = + (in.s01 > in.s23) || + (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2))); + in.s01 = select(in.s23, in.s01, idx_sel.s01); + res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); + + idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT)); + res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); + + return res.s0 + x_elem; +#endif // WIDTH < 16 +} +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) + +/** This kernel performs parallel reduction given an operation on x-axis. + * + * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed + * using -DPREV_OUTPUT + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. + * -DDATA_TYPE_OUTPUT=uint + * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the + * ArgMax + * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the + * ArgMin + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * types: S32/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension + * (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension + * (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[in] prev_res_ptr (Optional) Pointer to previous results + * tensor. 
Supported data types: U32/S32 + * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X + * dimension (in bytes) + * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y + * dimension (in bytes) + * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element + * in the previous results tensor + * @param[in] partial_res_ptr The local buffer to hold partial result + * values. Supported data types: U32/S32 + * @param[in] partial_res_stride_x Stride of the output tensor in X dimension + * (in bytes) + * @param[in] partial_res_step_x partial_res_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension + * (in bytes) + * @param[in] partial_res_step_y partial_res_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the + * source tensor + * @param[in] local_results Local buffer for storing the partial result + */ +__kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src), +#if defined(PREV_OUTPUT) + IMAGE_DECLARATION(prev_res), +#endif // defined(PREV_OUTPUT) + IMAGE_DECLARATION(partial_res), + __local DATA_TYPE_OUTPUT *local_results) +{ +#if defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); + Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res); +#else // !defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT(src); +#endif // defined(PREV_OUTPUT) + Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res); + + unsigned int lsize = get_local_size(0); + unsigned int lid = get_local_id(0); + + const uint x_idx = get_global_id(0); + const uint y_idx = get_global_id(1); + const __global DATA_TYPE *src_in_row = + (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); + + for (unsigned int y = 0; y < get_local_size(1); ++y) + { +#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) + local_results[lid] = + arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#else // defined(ARG_MIN) +#if defined(PREV_OUTPUT) + local_results[lid] = + arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) || defined(ARG_MIN) + + barrier(CLK_LOCAL_MEM_FENCE); + + // Looking for the next highest power of 2 (maximum value of lsize is 8) + unsigned int middle = lsize - 1; + middle |= middle >> 1; + middle |= middle >> 2; + middle += 1; + // Perform parallel reduction + DATA_TYPE_OUTPUT condition_check3; + for (unsigned int i = middle; i > 0; i >>= 1) + { + if (lid < i && lid + i < lsize) + { + DATA_TYPE tmp0 = *(src_in_row + local_results[lid]); + DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); +#if defined(ARG_MAX) + condition_check3 = + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1); + 
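// In the select that follows, the work-item replaces its index with the candidate from
// lid + i only when that candidate's value is strictly better (greater for ARG_MAX,
// smaller for ARG_MIN), or when the two values tie and the candidate carries the smaller
// index; ties therefore resolve to the lowest index across the local reduction.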
local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3); +#else // defined(ARG_MIN) + local_results[lid] = select( + local_results[lid], local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); +#endif // defined(ARG_MAX) || defined(ARG_MIN) + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0]; + } + } +} +#endif // defined(WIDTH) + +#if defined(HEIGHT) +/** This kernel performs reduction on y-axis. + * + * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. + * -DDATA_TYPE=float + * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. + * -DDATA_TYPE_OUTPUT=uint + * @note The data type of the select results must be passed at compile time using + * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int + * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * types: S32/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] output_ptr The local buffer to hold sumed values. Supported + * data types: U32/S32 + * @param[in] output_stride_x Stride of the output tensor in X dimension (in + * bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in + * bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source + * tensor + */ +__kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image output = CONVERT_TO_IMAGE_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, 16) + res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + indx = 0; + for (unsigned int y = 1; y < HEIGHT; ++y) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + in = + CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); + indx = select(indx, y, cond_conv); + res = select(res, in, CONDITION_TO_USE(in, res)); + } + + // Store result + vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr); +} +#endif // defined(HEIGHT) + +#if defined(DEPTH) +/** This kernel performs reduction on z-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data type of the select results must be passed at compile time using + * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int + * @note The depth size must be passed at compile time using -DDEPTH e.g. 
-DDEPTH=128 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data + * types: S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] output_ptr The local buffer to hold sumed values. Supported + * data types: U32/S32 + * @param[in] output_stride_x Stride of the output tensor in X dimension (in + * bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in + * bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source + * tensor + */ +__kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, 16) + res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), + VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + indx = 0; + for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), + VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); + indx = select(indx, z, cond_conv); + res = select(res, in, CONDITION_TO_USE(in, res)); + } + + // Store result + vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr); +} +#endif /* defined(DEPTH) */ + +#if defined(BATCH) && defined(DEPTH) +/** This kernel performs reduction on w-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data type of the select results must be passed at compile time using + * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int + * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128 + * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data + * types: S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] input_step_w input_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] output_ptr The local buffer to hold sumed values. Supported + * data types: U32/S32 + * @param[in] output_stride_x Stride of the output tensor in X dimension (in + * bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in + * bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in + * bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the output tensor in W dimension (in + * bytes) + * @param[in] output_step_w output_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source + * tensor + */ +__kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) +{ + Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH); + + VEC_DATA_TYPE(DATA_TYPE, 16) + res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), + VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + indx = 0; + for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), + VEC_DATA_TYPE(DATA_TYPE, 16)); + + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)); + indx = select(indx, w, cond_conv); + res = select(res, in, CONDITION_TO_USE(in, res)); + } + + // Store result + vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr); +} +#endif /* defined(BATCH) && defined(DEPTH) */ +#endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */ diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl index e249663bc..f8b5bbeb8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/binary_logical_op.cl @@ -111,14 +111,14 @@ __kernel void binary_logical_op(TENSOR3D_DECLARATION(input1), TENSOR3D_DECLARATI #if OP_CODE == 1 // LOGICAL AND VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) && - VLOAD(VEC_SIZE)(0, 
(__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); #elif OP_CODE == 2 // LOGICAL OR VSTORE(VEC_SIZE) (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input1.ptr) || - VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input2.ptr), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl new file mode 100644 index 000000000..3b0a175a4 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function performs a up-scaling depth conversion for boolean type input. + * + * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and + * -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. + * -DVEC_SIZE=16 + * @note The integer shift amount value need to be passed at compile time using -DSHIFT: + * e.g. -DSHIFT=7 + * + * @param[in] in_ptr Pointer to the source image. 
Supported data types: + * U8 + * @param[in] in_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] in_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] in_step_z in_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data + * types: U8/S8/U16/S16/U32/S32/F16/F32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination + * image + */ +__kernel void cast_bool(TENSOR3D_DECLARATION(in), TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in.ptr); + + VSTORE(VEC_SIZE) + (CONVERT(in_data & 1, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, + (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl index 92e5dfbee..5ebc78d23 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/embedding_lookup.cl @@ -117,15 +117,15 @@ __kernel void embedding_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION // lookup ids for based on the tensor dimensions int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? 
*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; in.ptr += input_offset_first_element_in_bytes + lup_id[0] * input_step_x + lup_id[1] * input_step_y + lup_id[2] * input_step_z + lup_id[3] * input_step_w; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl new file mode 100644 index 000000000..9b826a2bd --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl @@ -0,0 +1,7210 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "gemm_helpers.h" +#include "repeat.h" + +#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) +#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1) +#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2) +#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3) +#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7) +#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +#define CONCAT_INC(K0) INC##K0 +#define INC(K0) CONCAT_INC(K0) + +#if (SRC_WIDTH % K0) +#define BOUNDARY_CONDITION_X(x, a) \ + ({ \ + a = select( \ + 0, a, \ + CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), \ + VEC_DATA_TYPE(DATA_TYPE, K0))); \ + }) +#else // (SRC_WIDTH % K0) +#define BOUNDARY_CONDITION_X(x, a) ({}) +#endif // (SRC_WIDTH % K0) + +/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks + * of size M0xK0 and stores each one (not transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float) + * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. + * -DSRC_WIDTH=16) + * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. + * -DM0=2, -DK0=2). + * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at + * compile time using -DV0 (e.g. -DV0=2) + * @note Only the following values for M0, K0 and V0 are supported: + * M0: 2,3,4,5,6,7,8 + * K0: 2,3,4,8,16 + * V0: greater than 0 + * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer + * 1x1), the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * + * @param[in] src_ptr Pointer to the source LHS tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_INPUT_AS_3D) + */ +__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +) +{ + // Block size +#define BLOCK_SIZE ((M0) * (K0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (K0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (K0) * (V0) +#else // Do not interleave +#define OUTPUT_STEP_X (K0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = 
get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + + ((y / (uint)V0) * (uint)dst_stride_y) + + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); + + // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src_stride_z by DEPTH_GEMM3D + + input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); + +#else // defined(REINTERPRET_INPUT_AS_3D) + + input_ptr += z * (uint)src_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + output_ptr += z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + // Load values from the LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); + BOUNDARY_CONDITION_X(x, a0); +#if M0 > 1 + BOUNDARY_CONDITION_X(x, a1); +#endif // M0 > 1 +#if M0 > 2 + BOUNDARY_CONDITION_X(x, a2); +#endif // M0 > 2 +#if M0 > 3 + BOUNDARY_CONDITION_X(x, a3); +#endif // M0 > 3 +#if M0 > 4 + BOUNDARY_CONDITION_X(x, a4); +#endif // M0 > 4 +#if M0 > 5 + BOUNDARY_CONDITION_X(x, a5); +#endif // M0 > 5 +#if M0 > 6 + BOUNDARY_CONDITION_X(x, a6); +#endif // M0 > 6 +#if M0 > 7 + BOUNDARY_CONDITION_X(x, a7); +#endif // M0 > 7 + // ---------------------------Store output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} + +#if M0 == 2 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 3 // M0 == 3 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 4 // M0 == 4 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 5 // M0 == 5 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + DATA_TYPE res1 = a4.s##i; \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * 
sizeof(DATA_TYPE))); \ + *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \ + }) +#elif M0 == 6 // M0 == 6 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + VSTORE(2) \ + (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ + }) +#elif M0 == 7 // M0 == 7 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VEC_DATA_TYPE(DATA_TYPE, 3) \ + res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + VSTORE(3) \ + (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ + }) +#elif M0 == 8 // M0 == 8 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, \ + a6.s##i, a7.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#else // M0 not supported +#error "M0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks + * of size M0xK0 and stores each one (transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. + * -DSRC_WIDTH=16) + * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. + * -DM0=2, -DK0=2). + * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at + * compile time using -DV0 (e.g. -DV0=2) + * @note Only the following values for M0, K0 and V0 are supported: + * M0: 2,3,4,5,6,7,8 + * K0: 2,3,4,8,16 + * V0: greater than 0 + * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer + * 1x1), the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * + * @param[in] src_ptr Pointer to the source LHS tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_INPUT_AS_3D) + */ +__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +) +{ + // Block size +#define BLOCK_SIZE ((M0) * (K0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (M0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (M0) * (V0) +#else // Do not interleave +#define OUTPUT_STEP_X (M0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + + ((y / (uint)V0) * (uint)dst_stride_y) + + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); + + // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply src_stride_z by DEPTH_GEMM3D + + input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); + +#else // defined(REINTERPRET_INPUT_AS_3D) + + input_ptr += z * (uint)src_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + output_ptr += z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + + // Load values from the LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); + BOUNDARY_CONDITION_X(x, a0); +#if M0 > 1 + BOUNDARY_CONDITION_X(x, a1); +#endif // M0 > 1 +#if M0 > 2 + BOUNDARY_CONDITION_X(x, a2); +#endif // M0 > 2 +#if M0 > 3 + BOUNDARY_CONDITION_X(x, a3); +#endif // M0 > 3 +#if M0 > 4 + BOUNDARY_CONDITION_X(x, a4); +#endif // M0 > 4 +#if M0 > 5 + BOUNDARY_CONDITION_X(x, a5); +#endif // M0 > 5 +#if M0 > 6 + BOUNDARY_CONDITION_X(x, a6); +#endif // M0 > 6 +#if M0 > 7 + BOUNDARY_CONDITION_X(x, a7); +#endif // M0 > 7 + // ---------------------------Transpose and store block ----------------------- + + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1); +#if K0 > 2 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2); +#endif // K0 > 2 +#if K0 > 3 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3); +#endif // K0 > 3 +#if K0 > 4 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7); +#endif // K0 > 4 +#if K0 > 8 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F); +#endif // K0 > 8 + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} +#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) + +#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) +/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks + * of size K0xN0 and stores each one (not transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. + * -DSRC_HEIGHT=16) + * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. + * -DK0=2, -DN0=2). + * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at + * compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. 
+ * @note Only the following values for K0, N0 and H0 are supported: + * N0: 2,3,4,8,16 + * K0: 1,2,3,4,8,16 + * H0: greater than 0 + * + * @param[in] src_ptr Pointer to the source RHS tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Block size +#define BLOCK_SIZE ((K0) * (N0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (N0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (N0) * (H0) +#else // Do not interleave +#define OUTPUT_STEP_X (N0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + + z * (uint)src_stride_z; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + + ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, + 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0; + + // Load values from the RHS matrix + a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); +#if K0 > 1 + if (y * (uint)K0 + 1 < SRC_HEIGHT) + { + a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + } +#endif // K0 > 1 +#if K0 > 2 + if (y * (uint)K0 + 2 < SRC_HEIGHT) + { + a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * 
src_stride_y)); + } +#endif // K0 > 2 +#if K0 > 3 + if (y * (uint)K0 + 3 < SRC_HEIGHT) + { + a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); + } +#endif // K0 > 3 +#if K0 > 4 + if (y * (uint)K0 + 4 < SRC_HEIGHT) + { + a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); + } + if (y * (uint)K0 + 5 < SRC_HEIGHT) + { + a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); + } + if (y * (uint)K0 + 6 < SRC_HEIGHT) + { + a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); + } + if (y * (uint)K0 + 7 < SRC_HEIGHT) + { + a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); + } +#endif // K0 > 4 +#if K0 > 8 + if (y * (uint)K0 + 8 < SRC_HEIGHT) + { + a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); + } + if (y * (uint)K0 + 9 < SRC_HEIGHT) + { + a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); + } + if (y * (uint)K0 + 10 < SRC_HEIGHT) + { + aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); + } + if (y * (uint)K0 + 11 < SRC_HEIGHT) + { + aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); + } + if (y * (uint)K0 + 12 < SRC_HEIGHT) + { + aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); + } + if (y * (uint)K0 + 13 < SRC_HEIGHT) + { + aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); + } + if (y * (uint)K0 + 14 < SRC_HEIGHT) + { + aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); + } + if (y * (uint)K0 + 15 < SRC_HEIGHT) + { + aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); + } +#endif // K0 > 8 + + // ---------------------------Store output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} + +#if defined(TRANSPOSE) +/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks + * of size K0xN0 and stores each one (transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. + * -DSRC_HEIGHT=16) + * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. + * -DK0=2, -DN0=2). + * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at + * compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * @note The option -DTRANSPOSE must passed at compile time. + * @note Only the following values for K0, N0 and H0 are supported: + * N0: 2,3,4,8,16 + * K0: 2,3,4,8,16 + * H0: greater than 0 + * + * @param[in] src_ptr Pointer to the source RHS tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Block size +#define BLOCK_SIZE ((K0) * (N0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (K0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (K0) * (H0) +#else // Do not interleave +#define OUTPUT_STEP_X (K0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + + z * (uint)src_stride_z; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + + ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... 
a(K0-1)=0; + + // Load values from the RHS matrix + a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + if (y * (uint)K0 + 1 < SRC_HEIGHT) + { + a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + } +#if K0 > 2 + if (y * (uint)K0 + 2 < SRC_HEIGHT) + { + a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); + } +#endif // K0 > 2 +#if K0 > 3 + if (y * (uint)K0 + 3 < SRC_HEIGHT) + { + a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); + } +#endif // K0 > 3 +#if K0 > 4 + if (y * (uint)K0 + 4 < SRC_HEIGHT) + { + a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); + } + if (y * (uint)K0 + 5 < SRC_HEIGHT) + { + a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); + } + if (y * (uint)K0 + 6 < SRC_HEIGHT) + { + a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); + } + if (y * (uint)K0 + 7 < SRC_HEIGHT) + { + a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); + } +#endif // K0 > 4 +#if K0 > 8 + if (y * (uint)K0 + 8 < SRC_HEIGHT) + { + a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); + } + if (y * (uint)K0 + 9 < SRC_HEIGHT) + { + a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); + } + if (y * (uint)K0 + 10 < SRC_HEIGHT) + { + aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); + } + if (y * (uint)K0 + 11 < SRC_HEIGHT) + { + aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); + } + if (y * (uint)K0 + 12 < SRC_HEIGHT) + { + aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); + } + if (y * (uint)K0 + 13 < SRC_HEIGHT) + { + aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); + } + if (y * (uint)K0 + 14 < SRC_HEIGHT) + { + aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); + } + if (y * (uint)K0 + 15 < SRC_HEIGHT) + { + aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); + } +#endif // K0 > 8 + + // ---------------------------Transpose the block ------------------------------ + REPEAT_VAR_INIT_TO_CONST( + N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, + 0); // VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... 
res(N0-1)=0; + +#if K0 == 2 + // This part computes the following transpositions: + // 2x2 -> 2x2 + // 2x4 -> 4x2 + // 2x8 -> 8x2 + // 2x16 -> 16x2 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF); +#endif // N0 > 8 + +#elif K0 == 3 // K0 == 2 + // This part computes the following transpositions: + // 3x2 -> 2x3 + // 3x4 -> 4x3 + // 3x8 -> 8x3 + // 3x16 -> 16x3 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF); +#endif // N0 > 8 + +#elif K0 == 4 // K0 == 4 + // This part computes the following transpositions: + // 4x2 -> 2x4 + // 4x4 -> 4x4 + // 4x8 -> 8x4 + // 4x16 -> 16x4 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC); 
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF); +#endif // N0 > 8 + +#elif K0 == 8 // K0 == 8 + // This part computes the following transpositions: + // 8x2 -> 2x8 + // 8x4 -> 4x8 + // 8x8 -> 8x8 + // 8x16 -> 16x8 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF); +#endif // N0 > 8 + +#elif K0 == 16 // K0 == 16 + + // This part computes the following transpositions: + // 16x2 -> 2x16 + // 16x4 -> 4x16 + // 16x8 -> 8x16 + // 16x16 -> 16x16 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0, + a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1, + a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2, + a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3, + a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4, + a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5, + a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6, + a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7, + a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 
= (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8, + a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9, + a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA, + a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB, + a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC, + a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD, + a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE, + a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF, + a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF); +#endif // N0 > 8 + +#else // N0 == 16 +#error "Not supported N0 value" +#endif // N0 > 2 + + // ---------------------------Store the output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} +#endif // defined(TRANSPOSE) +#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) + +#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && \ + defined(M) && defined(N) && defined(K) + +#define CONCAT(a, b) a##b + +#define ARM_DOT1(a, b, c) ({ c = fma(a, b, c); }) +#define ARM_DOT2(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT2(a, b, c); \ + c = fma((a.s2), (b.s2), c); \ + }) +#define ARM_DOT4(a, b, c) \ + ({ \ + ARM_DOT3(a, b, c); \ + c = fma((a.s3), (b.s3), c); \ + }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) + +#if N0 == 2 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##4), (c.s4)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##5), (c.s5)); \ + CONCAT(ARM_DOT, k0) \ + ((a), 
(b##6), (c.s6)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##7), (c.s7)); \ + }) +#elif N0 == 16 // N0 == 16 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##4), (c.s4)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##5), (c.s5)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##6), (c.s6)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##7), (c.s7)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##8), (c.s8)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##9), (c.s9)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##A), (c.sA)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##B), (c.sB)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##C), (c.sC)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##D), (c.sD)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##E), (c.sE)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##F), (c.sF)); \ + }) +#else // N0 not supported +#error "N0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. 
Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + 
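+  // Worked example for the macros defined just below (illustrative values, not required by the
+  // kernel): with -DK0=4 -DN0=4 -DH0=2 and -DRHS_INTERLEAVE, RHS_OFFSET_X = 4 and
+  // RHS_STEP_X = 8, so the two blocks interleaved on one reshaped RHS row start 4 elements
+  // apart and consecutive K0-wide slices of the same block sit 8 elements apart, consistent
+  // with the layout produced by gemm_reshape_rhs_matrix_t above.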
// Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(K0, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(K0, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(K0, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(K0, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(K0, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(K0, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(K0, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(K0, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(1, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(1, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(1, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(1, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(1, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(1, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(1, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(1, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#define VFMA(a, b, c) ({ c = fma(a, b, c); }) + +#if M0 == 1 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + }) +#elif M0 == 2 // M0 == 2 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + }) +#elif M0 == 3 // M0 == 3 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + }) +#elif M0 == 4 // M0 == 4 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + }) +#elif M0 == 5 // M0 == 5 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + 
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + }) +#elif M0 == 6 // M0 == 6 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + }) +#elif M0 == 7 // M0 == 7 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + }) +#elif M0 == 8 // M0 == 8 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ + }) +#else // M0 not supported +#error "M0 not supported" +#endif // M0 not supported + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90). + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + 
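+  // Worked example (illustrative values, not required by the kernel): with -DN0=4 -DH0=2,
+  // -DRHS_INTERLEAVE and float data, work-item x = 3 starts reading the reshaped RHS at
+  // (3 % 2) * RHS_OFFSET_X * sizeof(float) = 1 * 4 * 4 = 16 bytes into the row selected by
+  // x / H0 = 1, i.e. at the beginning of the second interleaved block on that row.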
REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); // uint zin0=0,zin1=0,zin2=0,... zin7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0,zero1=0,zero2=0,... zero7=0; + +#if defined(REINTERPRET_INPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); + + LD_RHS_VFMA_M0xN0(0, a, c); + LD_RHS_VFMA_M0xN0(1, a, c); +#if K0 > 2 + LD_RHS_VFMA_M0xN0(2, a, c); +#endif // K0 > 2 +#if K0 > 3 + LD_RHS_VFMA_M0xN0(3, a, c); +#endif // K0 > 3 +#if K0 > 4 + LD_RHS_VFMA_M0xN0(4, a, c); + LD_RHS_VFMA_M0xN0(5, a, c); + LD_RHS_VFMA_M0xN0(6, a, c); + LD_RHS_VFMA_M0xN0(7, a, c); +#endif // K0 > 4 +#if K0 > 8 + LD_RHS_VFMA_M0xN0(8, a, c); + LD_RHS_VFMA_M0xN0(9, a, c); + LD_RHS_VFMA_M0xN0(A, a, c); + LD_RHS_VFMA_M0xN0(B, a, c); + LD_RHS_VFMA_M0xN0(C, a, c); + LD_RHS_VFMA_M0xN0(D, a, c); + LD_RHS_VFMA_M0xN0(E, a, c); + LD_RHS_VFMA_M0xN0(F, a, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); +#endif // M0 > 4 +#if M0 > 5 + VEC_DATA_TYPE(DATA_TYPE, 2) + a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); +#endif // M0 > 5 +#if M0 > 6 + VEC_DATA_TYPE(DATA_TYPE, 2) + a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); +#endif // M0 > 6 +#if M0 > 7 + VEC_DATA_TYPE(DATA_TYPE, 2) + a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); +#endif // M0 > 7 + + LD_RHS_VFMA_M0xN0(0, a, c); + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * 
dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && + // defined(M) && defined(N) && defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && \ + defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) + +#if defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + c += a.s8 * b.s8; \ + c += a.s9 * b.s9; \ + c += a.sA * b.sA; \ + c += a.sB * b.sB; \ + c += a.sC * b.sC; \ + c += a.sD * b.sD; \ + c += a.sE * b.sE; \ + c += a.sF * b.sF; \ + }) +#else // K0 not supported +#error "K0 value not supported" 
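+// Note on the two ARM_DOT_K0 variants (illustrative; DATA_TYPE=half is only an example): with
+// -DMIXED_PRECISION the products above are accumulated with '+=' into DATA_TYPE_ACCUMULATOR
+// (e.g. float while DATA_TYPE=half), whereas the fma() variant below keeps the whole
+// accumulation in DATA_TYPE.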
+#endif // K0 conditions +#else // defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + c = fma(a.s8, b.s8, c); \ + c = fma(a.s9, b.s9, c); \ + c = fma(a.sA, b.sA, c); \ + c = fma(a.sB, b.sB, c); \ + c = fma(a.sC, b.sC, c); \ + c = fma(a.sD, b.sD, c); \ + c = fma(a.sE, b.sE, c); \ + c = fma(a.sF, b.sF, c); \ + }) +#else // K0 not supported +#error "K0 value not supported" +#endif // K0 conditions +#endif // defined(MIXED_PRECISION) + +#if N0 == 2 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + }) +#elif N0 == 16 // N0 == 16 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + ARM_DOT_K0((a), (b##8), (c.s8)); \ + ARM_DOT_K0((a), (b##9), (c.s9)); \ + ARM_DOT_K0((a), (b##A), (c.sA)); \ + ARM_DOT_K0((a), (b##B), (c.sB)); \ + ARM_DOT_K0((a), (b##C), (c.sC)); \ + ARM_DOT_K0((a), (b##D), (c.sD)); \ + ARM_DOT_K0((a), (b##E), (c.sE)); \ + ARM_DOT_K0((a), (b##F), (c.sF)); \ + }) +#else // N0 not supported +#error "N0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT + * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 + * must be transposed + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using
+ * -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION
+ * passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed
+ * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x                        src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y                        src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr                           Pointer to the RHS reshaped matrix.
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. 
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) + if ((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (get_global_id(0) / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += get_global_id(2) * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + + for (int i = 0; i < k; i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(a7, b, c7); +#endif // M0 > 7 + + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); + rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, + dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += get_global_id(2) * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // 
defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#if defined(LHS_TRANSPOSE) + +#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) + +#if defined(MIXED_PRECISION) + +#if (GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) \ + c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * \ + (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) \ + c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), \ + (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#else // defined(MIXED_PRECISION + +#if (GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) c += (a) * (b); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#endif // defined(MIXED_PRECISION) + +#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) ({ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); }) +#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ + }) +#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ + }) +#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ + }) +#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ + }) + +// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. 
+// K0 = 1 a is the column-vector (transposed) b is the row-vector (not transposed) C is the output +// matrix Lower case is a vector (a, b) Upper case is a matrix (C) +#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) + +#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ + ({ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); }) +#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ + }) +#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ + }) +#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ + }) +#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ + }) +#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ + }) + +// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. +// The dimensions for this matrix multiplications are defined through M0, N0 and K0 +// The dimensions supported are: +// M0: 1, 2, 3, 4, 8 +// N0: 1, 2, 3, 4, 8, 16 +// K0: 1, 2, 3, 4, 8, 16 +// This macro calls the vector-by-matrix macro K0 times +// A, B and C are matrices +#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ + CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ + (M0, N0, TYPE, A, B, C) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be + * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 + * must be NOT transposed + * + * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. + * -DLHS_TRANSPOSE). + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed
+ * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x                        src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y                        src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x                        src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y                        src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr                          (Optional) Pointer to the bias matrix.
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (M0) +#define LHS_STEP_X ((M0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (M0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#endif // defined(RHS_INTERLEAVE) + + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide 
matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + + __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); + __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); + + for (int i = 0; i < k; i += K0) + { + VEC_DATA_TYPE(DATA_TYPE, M0) + a0 = VLOAD(M0)(0, lhs); + VEC_DATA_TYPE(DATA_TYPE, N0) + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + +#if K0 > 1 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 1 + +#if K0 > 2 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 2 + +#if K0 > 3 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 3 + +#if K0 > 4 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 4 + +#if K0 > 8 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 8 + +#ifndef LHS_INTERLEAVE + lhs += (M0 * K0 * (V0 - 1)); +#endif // LHS_INTERLEAVE + +#ifndef RHS_INTERLEAVE + rhs += (N0 * K0 * (H0 - 1)); +#endif // RHS_INTERLEAVE + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * 
M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = + bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + + z * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#endif // defined(LHS_TRANSPOSE) + +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && + // defined(DATA_TYPE) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) + +#define VFMA(a, b, c) ({ c = fma(a, b, c); }) + +#if M0 == 1 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); }) +#elif M0 == 2 // M0 == 2 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + }) +#elif M0 == 3 // M0 == 3 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + }) +#elif M0 == 4 // M0 == 4 
+#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + }) +#elif M0 == 5 // M0 == 5 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + }) +#elif M0 == 6 // M0 == 6 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + }) +#elif M0 == 7 // M0 == 7 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + }) +#elif M0 == 8 // M0 == 8 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ + }) +#else // M0 not supported +#error "M0 not supported" +#endif // M0 not supported + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., + * -DK0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. 
-DN0=2) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x lhs_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y lhs_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: + * same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes) + * @param[in] rhs_step_x rhs_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes) + * @param[in] rhs_step_y rhs_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero); + + RHS_VFMA_M0xN0(0, a, b0, c); + RHS_VFMA_M0xN0(1, a, b1, c); +#if K0 > 2 + RHS_VFMA_M0xN0(2, a, b2, c); +#endif // K0 > 2 +#if K0 > 3 + RHS_VFMA_M0xN0(3, a, b3, c); +#endif // K0 > 3 +#if K0 > 4 + RHS_VFMA_M0xN0(4, a, b4, c); + RHS_VFMA_M0xN0(5, a, b5, c); + RHS_VFMA_M0xN0(6, a, b6, c); + RHS_VFMA_M0xN0(7, a, b7, c); +#endif // K0 > 4 +#if K0 > 8 + RHS_VFMA_M0xN0(8, a, b8, c); + RHS_VFMA_M0xN0(9, a, b9, c); + RHS_VFMA_M0xN0(A, a, bA, c); + RHS_VFMA_M0xN0(B, a, bB, c); + RHS_VFMA_M0xN0(C, a, bC, c); + RHS_VFMA_M0xN0(D, a, bD, c); + RHS_VFMA_M0xN0(E, a, bE, c); + RHS_VFMA_M0xN0(F, a, bF, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4)); +#endif // M0 > 4 +#if M0 > 5 + VEC_DATA_TYPE(DATA_TYPE, 2) + a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5)); +#endif // M0 > 5 +#if M0 > 6 + VEC_DATA_TYPE(DATA_TYPE, 2) + a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6)); +#endif // M0 > 6 +#if M0 > 7 + VEC_DATA_TYPE(DATA_TYPE, 2) + a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7)); +#endif // M0 > 7 + + VEC_DATA_TYPE(DATA_TYPE, N0) + b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y)); + RHS_VFMA_M0xN0(0, a, b, c); + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, 
DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) + +#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) +/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between + * matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint 
src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global float *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + 
zout *= (cross_plane_pad * dst_stride_y);
+
+  // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+  // multiply dst_stride_z by DEPTH_GEMM3D
+  dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+  // Add offset for batched GEMM
+  dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+  SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+  // Add beta*bias
+#if defined(BETA)
+  REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+  __global uchar *src2_addr =
+    src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+  LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+  SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+  // c = c + bias[broadcasted]
+  ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+  __global uchar *src2_addr =
+    src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+    (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+  LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+  SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+  // c = c + bias
+  ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+  ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+  // Store 4x4 block
+  vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+  vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+  vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+  vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between
+ * matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + +#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH)) + + int i = 0; + for (; i <= (int)(COLS_MTX_B - 4); i += 4) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, 
src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + for (; i < (int)(COLS_MTX_B); ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, 
c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x4 block + vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +// Undefine local defines +#undef COLS_MTX_B + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and + * matrix B 
reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * 
MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and + * matrix B reshaped (src1) while accumulating the result in a 32 floating point variable. + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint 
src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float8 c0 = 0.0f; + float8 c1 = 0.0f; + float8 c2 = 0.0f; + float8 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT)); + b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / 
(uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias_f, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias_f0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + float8 bias_f1 = convert_float8(bias1); + float8 bias_f2 = convert_float8(bias2); + float8 bias_f3 = convert_float8(bias3); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias_f, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias_f); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + half8 c_h0 = convert_half8(c0); + half8 c_h1 = convert_half8(c1); + half8 c_h2 = convert_half8(c2); + half8 c_h3 = convert_half8(c3); + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication + * between matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. 
a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + +#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH)) + + int i = 0; + for (; i <= (int)(COLS_MTX_B - 4); i += 4) + { +#if MULT_INTERLEAVE4X4_HEIGHT == 1 + // Load values from matrix A (interleaved) and matrix B 
(transposed) + half8 a0 = vload8(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload8(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); +#else // MULT_INTERLEAVE4X4_HEIGHT == 1 + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); +#endif // MULT_INTERLEAVE4X4_HEIGHT == 1 + } + + for (; i < (int)(COLS_MTX_B); ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the 
presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +// Undefine local defines +#undef COLS_MTX_B + +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) + +#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && \ + (NUM_ELEMS_PROCESSED_PER_THREAD_Y) +#if defined(DATA_TYPE) +#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped. + * + * @note This OpenCL kernel works with floating point data types (F16/F32) + * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. 
+ * -DDATA_TYPE=float) + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y + * @note The number of matrix A columns and the optional alpha's value need to be passed at compile + * time using -DCOLS_A and -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16/F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(DATA_TYPE); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // 
Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE)); + + VECTOR_TYPE acc0 = 0.0f; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VECTOR_TYPE acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VECTOR_TYPE acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VECTOR_TYPE acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); + src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, + src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( + 0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0.s0; + acc0 += b1 * (VECTOR_TYPE)a0.s1; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * (VECTOR_TYPE)a1.s0; + acc1 += b1 * (VECTOR_TYPE)a1.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * (VECTOR_TYPE)a2.s0; + acc2 += b1 * (VECTOR_TYPE)a2.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * (VECTOR_TYPE)a3.s0; + acc3 += b1 * (VECTOR_TYPE)a3.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * (VECTOR_TYPE)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * (VECTOR_TYPE)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * (VECTOR_TYPE)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
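  // Illustrative sketch (not part of the original kernel): for a single output row
  // r = get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y + k (k = 0..3), the zout term
  // computed above reduces to
  //   uint plane  = min((uint)(DEPTH_GEMM3D - 1), r / (uint)HEIGHT_GEMM3D); // 2D plane holding row r
  //   uint offset = plane * dst_cross_plane_pad * dst_stride_y;             // padding bytes to skip
  // i.e. each row handled by this work-item carries its own byte offset zout.sk that jumps over
  // the bottom padding of every plane preceding it.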
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, + zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, + src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, + dst_addr, dst_stride_y, zout.s); +} +#endif // defined(DATA_TYPE) + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma + * units. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to 
the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float4 acc0 = 0.0f; + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float4 acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float4 acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float4 acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s0, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s0, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s0, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s0, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s0, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s0, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s0, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s0, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s0, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s0, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s0, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s0, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s0, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s0, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s1, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s1, b0.s1, acc0.s1); + acc0.s2 = 
fma(a0.s1, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s1, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s1, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s1, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s1, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s1, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s1, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s1, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s1, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s1, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s1, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s1, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s1, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s1, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s2, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s2, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s2, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s2, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s2, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s2, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s2, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s2, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s2, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s2, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s2, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s2, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s2, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s2, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s2, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s2, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s3, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s3, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s3, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s3, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s3, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s3, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s3, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s3, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s3, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s3, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s3, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s3, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s3, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s3, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s3, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s3, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(float); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + 
zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); + acc0.s2 = fma(a0, b0.s2, acc0.s2); + acc0.s3 = fma(a0, b0.s3, acc0.s3); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); + acc1.s2 = fma(a1, b0.s2, acc1.s2); + acc1.s3 = fma(a1, b0.s3, acc1.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); + acc2.s2 = fma(a2, b0.s2, acc2.s2); + acc2.s3 = fma(a2, b0.s3, acc2.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); + acc3.s2 = fma(a3, b0.s2, acc3.s2); + acc3.s3 = fma(a3, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
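  // Minimal scalar reference (an assumption-laden sketch with hypothetical names m, n, bias_value;
  // not part of the original kernel) of what the epilogue below applies to each accumulated element:
  //   float v = acc;            // acc = sum_k A[m][k] * B[k][n], built with fma() in the loops above
  //   v = ALPHA * v;            // only when -DALPHA is defined (alpha != 1.0f)
  //   v += BETA * bias_value;   // only when -DBETA is defined; bias is row-broadcast with -DBROADCAST_BIAS
  //   v = activation(v);        // only when -DACTIVATION_TYPE is defined, applied after the bias add
  //   dst[m][n] = v;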
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma + * units. This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or + * equal to 1000. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if + * alpha!=1.0f. + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for + // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + 
(uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float2 acc0 = 0.0f; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float2 acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float2 acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float2 acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for (; i <= ((int)COLS_A - 8); i += 8) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0)); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s0 = fma(a0.s1, b1.s0, acc0.s0); + acc0.s0 = fma(a0.s2, b2.s0, acc0.s0); + acc0.s0 = fma(a0.s3, b3.s0, acc0.s0); + acc0.s0 = fma(a0.s4, b4.s0, acc0.s0); + acc0.s0 = fma(a0.s5, b5.s0, acc0.s0); + acc0.s0 = fma(a0.s6, b6.s0, acc0.s0); + acc0.s0 = fma(a0.s7, b7.s0, acc0.s0); + + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s1 = fma(a0.s1, b1.s1, acc0.s1); + acc0.s1 = fma(a0.s2, b2.s1, acc0.s1); + acc0.s1 = fma(a0.s3, b3.s1, acc0.s1); + acc0.s1 = fma(a0.s4, b4.s1, acc0.s1); + acc0.s1 = fma(a0.s5, b5.s1, acc0.s1); + acc0.s1 = fma(a0.s6, b6.s1, acc0.s1); + acc0.s1 = fma(a0.s7, b7.s1, acc0.s1); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * 
src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc1.s0 = fma(a0.s0, b0.s0, acc1.s0); + acc1.s0 = fma(a0.s1, b1.s0, acc1.s0); + acc1.s0 = fma(a0.s2, b2.s0, acc1.s0); + acc1.s0 = fma(a0.s3, b3.s0, acc1.s0); + acc1.s0 = fma(a0.s4, b4.s0, acc1.s0); + acc1.s0 = fma(a0.s5, b5.s0, acc1.s0); + acc1.s0 = fma(a0.s6, b6.s0, acc1.s0); + acc1.s0 = fma(a0.s7, b7.s0, acc1.s0); + + acc1.s1 = fma(a0.s0, b0.s1, acc1.s1); + acc1.s1 = fma(a0.s1, b1.s1, acc1.s1); + acc1.s1 = fma(a0.s2, b2.s1, acc1.s1); + acc1.s1 = fma(a0.s3, b3.s1, acc1.s1); + acc1.s1 = fma(a0.s4, b4.s1, acc1.s1); + acc1.s1 = fma(a0.s5, b5.s1, acc1.s1); + acc1.s1 = fma(a0.s6, b6.s1, acc1.s1); + acc1.s1 = fma(a0.s7, b7.s1, acc1.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc2.s0 = fma(a0.s0, b0.s0, acc2.s0); + acc2.s0 = fma(a0.s1, b1.s0, acc2.s0); + acc2.s0 = fma(a0.s2, b2.s0, acc2.s0); + acc2.s0 = fma(a0.s3, b3.s0, acc2.s0); + acc2.s0 = fma(a0.s4, b4.s0, acc2.s0); + acc2.s0 = fma(a0.s5, b5.s0, acc2.s0); + acc2.s0 = fma(a0.s6, b6.s0, acc2.s0); + acc2.s0 = fma(a0.s7, b7.s0, acc2.s0); + + acc2.s1 = fma(a0.s0, b0.s1, acc2.s1); + acc2.s1 = fma(a0.s1, b1.s1, acc2.s1); + acc2.s1 = fma(a0.s2, b2.s1, acc2.s1); + acc2.s1 = fma(a0.s3, b3.s1, acc2.s1); + acc2.s1 = fma(a0.s4, b4.s1, acc2.s1); + acc2.s1 = fma(a0.s5, b5.s1, acc2.s1); + acc2.s1 = fma(a0.s6, b6.s1, acc2.s1); + acc2.s1 = fma(a0.s7, b7.s1, acc2.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc3.s0 = fma(a0.s0, b0.s0, acc3.s0); + acc3.s0 = fma(a0.s1, b1.s0, acc3.s0); + acc3.s0 = fma(a0.s2, b2.s0, acc3.s0); + acc3.s0 = fma(a0.s3, b3.s0, acc3.s0); + acc3.s0 = fma(a0.s4, b4.s0, acc3.s0); + acc3.s0 = fma(a0.s5, b5.s0, acc3.s0); + acc3.s0 = fma(a0.s6, b6.s0, acc3.s0); + acc3.s0 = fma(a0.s7, b7.s0, acc3.s0); + + acc3.s1 = fma(a0.s0, b0.s1, acc3.s1); + acc3.s1 = fma(a0.s1, b1.s1, acc3.s1); + acc3.s1 = fma(a0.s2, b2.s1, acc3.s1); + acc3.s1 = fma(a0.s3, b3.s1, acc3.s1); + acc3.s1 = fma(a0.s4, b4.s1, acc3.s1); + acc3.s1 = fma(a0.s5, b5.s1, acc3.s1); + acc3.s1 = fma(a0.s6, b6.s1, acc3.s1); + acc3.s1 = fma(a0.s7, b7.s1, acc3.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float) * 8; + } + // float size increment + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + 
zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
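  // Example build options (values are illustrative assumptions, not taken from the patch) showing
  // how the compile-time definitions documented above might be supplied when compiling this
  // variant with clBuildProgram():
  //   "-DCOLS_A=1024 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4 "
  //   "-DALPHA=1.5f -DBETA=0.5f -DACTIVATION_TYPE=RELU -DA_VAL=0.0f -DB_VAL=0.0f"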
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)); + + LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not beed reshaped + * + * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulating + * the result in a 32 floating point variable. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(half); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add 
offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + float8 acc0 = 0.0h; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float8 acc1 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float8 acc2 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float8 acc3 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (float8)a0.s0, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s0, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s0, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s0, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s1, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s1, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s1, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s1, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s2, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s2, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s2, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s2, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s3, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s3, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s3, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s3, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // 
| | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias_f, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float8 bias_f1 = convert_float8(bias1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float8 bias_f2 = convert_float8(bias2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float8 bias_f3 = convert_float8(bias3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + half8 acc_h0 = convert_half8(acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half8 acc_h1 = convert_half8(acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half8 acc_h2 = convert_half8(acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half8 acc_h3 = convert_half8(acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s); +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not beed reshaped + * + * @note This OpenCL kernel works with the 16-bit floating point data type (half) 
and uses the fma + * units. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(half); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset 
due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + half8 acc0 = 0.0h; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half8 acc1 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half8 acc2 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half8 acc3 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (half8)a0.s0, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s0, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s0, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s0, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s1, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s1, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s1, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s1, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s2, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s2, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s2, acc2); +#endif // 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s2, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s3, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s3, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s3, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s3, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 
NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s); +} +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && + // (NUM_ELEMS_PROCESSED_PER_THREAD_Y) + +#if defined(BETA) +/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account + * that the second matrix might be weighted by a scalar value beta: + * + * @note The beta's value need to be passed at compile time using -DBETA + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: + * F32 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] src_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Load values from A x B + float4 alpha_ab = vload4(0, (__global float *)dst.ptr); + + // Load values from Matrix C + float4 c = vload4(0, (__global float *)src.ptr); + + // Computes alpha * axb + beta * c + float4 out = alpha_ab + (float4)BETA * c; + + // Store final result in axb matrix + vstore4(out, 0, (__global float *)dst.ptr); +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account + * that the second matrix might be weighted by a scalar value beta: + * + * @note The beta's value need to be passed at compile time using -DBETA + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: + * F16 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] src_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Load values from A x B + half8 alpha_ab = vload8(0, (__global half *)dst.ptr); + + // Load values from Matrix C + half8 c = vload8(0, (__global half *)src.ptr); + + // Computes alpha * axb + beta * c + half8 out = alpha_ab + (half8)BETA * c; + + // Store final result in axb matrix + vstore8(out, 0, (__global half *)dst.ptr); +} +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#endif // defined(BETA) + +#if defined(WIDTH_VECTOR_A) +/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and + * matrix B (src1) used for locally connected layer + * + * @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A + * + * @note The input A and matrix B must not be reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. 
Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0), TENSOR3D_DECLARATION(src1), + IMAGE_DECLARATION(dst)) +{ + int idx = get_global_id(0) * 4; + int idy = get_global_id(1); + + // Compute the address for the vector A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, + src1_offset_first_element_in_bytes + src1_stride_z * idy)); + src_addr.s1 += idx * sizeof(float); + + int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float)); + + float4 acc = 0.0f; + + for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); + src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y)) + { + float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0)); + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + acc += b0 * (float4)a0.s0; + acc += b1 * (float4)a0.s1; + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y)) + { + float a0 = *((__global float *)(src0_ptr + src_addr.s0)); + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + + acc += b0 * (float4)a0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0))); +} +#endif // defined(WIDTH_VECTOR_A) + +/** This kernel accumulates each row with the biases vector. + * + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short. + * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16. + * + * @param[in, out] accum_ptr Pointer to the accumulate tensor. 
Supported + * data type: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] accum_stride_x Stride of the accmulate tensor in X + * dimension (in bytes) + * @param[in] accum_step_x accum_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] accum_stride_y Stride of the accumlulate tensor in Y + * dimension (in bytes) + * @param[in] accum_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the + * accumulate tensor + * @param[in] biases_ptr Pointer to the biases vector. Same as @p + * accum_ptr + * @param[in] biases_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] biases_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +#if defined(DATA_TYPE) && defined(VECTOR_SIZE) +__kernel void gemm_accumulate_biases(IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases)) +{ + Image accum = CONVERT_TO_IMAGE_STRUCT(accum); + Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); + + // Vector size, e.g. number of vector elements. + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr); + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr); + accum_value = biases_value + accum_value; + // Store result in the accumulate buffer + VSTORE(VECTOR_SIZE) + (accum_value, 0, (__global DATA_TYPE *)accum.ptr); +} +#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h new file mode 100644 index 000000000..0c75d061f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h @@ -0,0 +1,1235 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" + +/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). + * @name LOAD_ROW_n + * + * @param[in] N0 The number of rows to load + * @param[in] DATA_TYPE The data type of variables + * @param[in] BASENAME The basename of the destination variables for the loaded rows + * @param[in] PTR The base pointer + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The z-axis offset vector + * @{ + */ +#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); + +#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); + +#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); + +#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); + +#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); + +#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); + +#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); + +#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); + +#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); + +#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); + +#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + 
VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); + +#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); + +#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); + +#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); + +#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); + +#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); + +/** @}*/ // end of group LOAD_ROW_n + +/** Load Blocks (consecutive rows and columns) with Z offset. + * @name LOAD_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 + * The data to load is expected to have consecutive names for each row. + * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2. + * + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive columns + * @param[in] DATA_TYPE The data type of the target + * @param[in] BASENAME The basename of the result variables + * @param[in] PTR The base pointer for the data + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride in y-axis direction + * @param[in] Z The z-axis offset vector + * @{ + */ +#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +/** @} */ // end of group LOAD_BLOCK + +/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
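+ * Unlike LOAD_ROW_n above, each destination variable here is assigned from a single
+ * element rather than a VLOAD of N0 values. As a rough illustration (the names ptr, off
+ * and stride_y are only placeholders), LOAD_ELEMENT_2(4, half, b, ptr, off, stride_y)
+ * expands to:
+ *
+ *   VEC_DATA_TYPE(half, 4) b0 = *((__global half *)(ptr + off + 0 * stride_y));
+ *   VEC_DATA_TYPE(half, 4) b1 = *((__global half *)(ptr + off + 1 * stride_y));
+ *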
+ * @name LOAD_ELEMENT_n + * + * @param[in] N0 The number of rows to load + * @param[in] DATA_TYPE The data type of variables + * @param[in] BASENAME The basename of the destination variables for the loaded rows + * @param[in] PTR The base pointer + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride value in y-axis direction + * @{ + */ +#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y)); + +#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y)); + +#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y)); + +#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); + +#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); + +#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); + +#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); + +#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); + +#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); + +#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); + +#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); + +#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); + +#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); + +#define 
LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); + +#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); + +#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); + +/** @}*/ // end of group LOAD_ELEMENT_n + +/** Load Scalar as Vector (consecutive elements). + * @name LOAD_SCALAR_AS_VECTOR + * + * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 + * The data to load is expected to have consecutive names for each row. + * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. + * + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive columns + * @param[in] DATA_TYPE The data type of the target + * @param[in] BASENAME The basename of the result variables + * @param[in] PTR The base pointer for the data + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride in y-axis direction + * @{ + */ +#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +/** @} */ // end of group LOAD_SCALAR_AS_VECTOR + +/** Basic macros to calculate Z offset values from Z0 to Zn-1 + * @name CALCULATE_Z_OFFSET_n + * + * @param[in] M0 The number of offset values to calculate + * @param[in] DATA_TYPE The data type of the results + * @param[in] Z The basename of the result variables + * @param[in] Y The work-itme ID of y-axis + * @param[in] HEIGHT_GEMM3D The height of GEMM3D + * @param[in] DEPTH_GEMM3D The depth of GEMM3D + * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension + * @param[in] STRIDE_Y The stride value in y-axis direction + * + * @{ + */ +#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ + Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ + Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ + Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, 
CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ + Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ + Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ + Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ + Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ + Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); + +/** @} */ // end of group CALCULATE_Z_OFFSET_n + +/** Calculate Z offset values from Z0 to Zn-1 + * @name CALCULATE_Z_OFFSET + * + * The Z offsets are expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin1, zin2, zin3. + * Note that, CROSS_PLANE_PAD (cross plain padding) is required to take into account + * the possible cross plane paddings in case of the plance changes across the z-dimension. 
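+ *
+ * For example, with M0 = 2 and DATA_TYPE = uint (the argument names y, pad and stride_y
+ * below are only placeholders), CALCULATE_Z_OFFSET(2, uint, zin, y, HEIGHT_GEMM3D,
+ * DEPTH_GEMM3D, pad, stride_y) roughly expands to:
+ *
+ *   zin0 = (0 + (uint)(y * (uint)2)) / (uint)HEIGHT_GEMM3D;
+ *   zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+ *   zin0 *= (pad * stride_y);
+ *
+ * and likewise zin1 starting from (1 + ...). This mirrors the zin/zout computation done
+ * explicitly in the GEMM kernels above.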
+ * + * <!-- + * | | + * | plane0 | + * | | + * |__________________| + * |******************| + * | cross_plane_pad | + * |******************| + * | | + * | plane1 | + * | | + * |__________________| + * --> + * + * @param[in] M0 The number of offset values to calculate + * @param[in] DATA_TYPE The data type of the results + * @param[in] Z The basename of the result variables + * @param[in] Y The work-itme ID of y-axis + * @param[in] HEIGHT_GEMM3D The height of GEMM3D + * @param[in] DEPTH_GEMM3D The depth of GEMM3D + * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension + * @param[in] STRIDE_Y The stride value in y-axis direction + * @{ + */ +#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) +#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) +/** @} */ // end of group CALCULATE_Z_OFFSET + +/** Store the 0 to (n-1)th rows of the given variables + * @name STORE_ROW_n + * + * @param[in] N0 The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); + +#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); + +#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); + +#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); + +#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); + +#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); + +#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); + +#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); + +#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); + +#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + 
VSTORE(N0) \ + (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); + +#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); + +#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); + +#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); + +#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); + +#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); + +#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); +/** @} */ // end of groupd STORE_ROW_n + +/** Convert and store the 0th to (n-1)th rows of the given variables + * @name CONVERT_STORE_ROW_n + * + * @param[in] N0 The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); + +#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); + +#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); + +#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); + +#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); + +#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); + +#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, 
PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); + +#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); + +#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); + +#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); + +#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); + +#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); + +#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); + +#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); + +#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); + +#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); + +/** @} */ // end of groupd CONVERT_STORE_ROW_n + +/** Store a block of the given size M0xN0 + * @name STORE_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 
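+ *
+ * For instance, the STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h,
+ * dst_addr, dst_stride_y, zout.s) call issued by the FP16 GEMM kernel above expands, for
+ * NUM_ELEMS_PROCESSED_PER_THREAD_Y = 2, through STORE_ROW_2 and STORE_ROW_1 to:
+ *
+ *   vstore8(acc_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ *   vstore8(acc_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));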
+ * + * @param[in] M0 The number of rows to store + * @param[in] N0 The size of each vector + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +/** @} */ // end of group STORE_BLOCK + +/** Convert and store a block of the given size M0xN0 + * @name CONVERT_STORE_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. + * + * @param[in] M0 The number of rows to store + * @param[in] N0 The size of each vector + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +/** @} */ // end of group CONVERT_STORE_BLOCK + +/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) + * @name SCALE_ROW_n + * + * @param[in] DATA_TYPE The data type of the variables + * @param[in] BASENAME The basename of the variables + * @param[in] SCALE The scale factor + * @{ + */ +#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##1 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##2 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##3 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##4 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##5 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##6 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##7 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##8 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##9 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##A *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_11(DATA_TYPE, 
BASENAME, SCALE) \ + BASENAME##B *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##C *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##D *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##E *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##F *= (DATA_TYPE)SCALE; +/** @} */ // end of group SCALE_ROW_n + +/** Scale elements stored in a block (BASENAME) + * @name SCALE_BLOCK + * + * Supported cases are N=1,2,3,...,16 + * + * @param[in] N The number of rows in the block + * @param[in] DATA_TYPE The data type of the block + * @param[in] BASENAME The basename of the block + * @param[in] SCALE The scale factor + * @{ + */ +#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) +#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) +/** @} */ // end of group SCALE_BLOCK + +/** Create a new vector containing the values at the given index for a set of given vectors + * @name COLUMN_VECTORn + * + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] X The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + * @{ + */ +#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ + TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); +#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 2) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); +#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 3) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); +#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 4) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \ + (X##2).s##IDX_COL, (X##3).s##IDX_COL); +#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \ + (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \ + (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); +#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \ + (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \ + (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, \ + (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, \ + (X##F).s##IDX_COL); +/** @} */ // end of group COLUMN_VECTORn + +/** Create a new vector containing the values at the given index. 
Utility macros for transposing a + * colum-vector + * @name COLUMN_VECTOR_SCALARn + * + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] X The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + * @{ + */ +#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0)); +#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 2) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); +#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 3) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); +#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 4) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); +#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); +#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \ + (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); +/** @} */ // end of group COLUMN_VECTORn + +/** Create transposed vectors of the given vectors + * @name TRANSPOSE_K0Xn + * + * @param[in] K0 The size of the source vectors + * @param[in] BASENAME The basename of transposed vectors + * @param[in] B The basename of source vectors for transposition + * @param[in] TYPE The data type of the transposed vectors + * @{ + */ +#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE); +#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \ + COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE); +#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X2(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE); +#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X3(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE); +#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X4(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE); +#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X8(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, A, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, B, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, C, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, D, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, E, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, F, BASENAME, B, TYPE); + +/** @} */ // end of group TRANSPOSE_K0Xn + +/** Create column vectors to contain the values at the given index for a set of given vectors + * + * @param[in] K0 The number of source vectors + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] B The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + */ +#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \ + CONCAT(COLUMN_VECTOR, K0) \ + (IDX_COL, BASENAME, B, TYPE); + +/** Create column vectors to contain the values at 
the given index. Utility macro for transposing a + * column-vector + * + * @param[in] K0 The number of source vectors + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] B The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + */ +#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \ + CONCAT(COLUMN_VECTOR_SCALAR, K0) \ + (IDX_COL, BASENAME, B, TYPE); + +/** Create transposed vectors form the given source vectors + * + * @param[in] K0 The size of source vectors + * @param[in] N0 The number of source vectors + * @param[in] BASENAME The basename of transposed vectors + * @param[in] B The basename of source vectors for transposition + * @param[in] TYPE The data type of the transposed vectors + * + */ +#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \ + CONCAT(TRANSPOSE_K0X, N0) \ + (K0, BASENAME, B, TYPE); + +/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1) + * @name ADD_ROW_n + * + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The basename of the added variables + * @{ + */ +#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0; + +#define ADD_ROW_2(BASENAME, BIAS) \ + ADD_ROW_1(BASENAME, BIAS) \ + BASENAME##1 += BIAS##1; + +#define ADD_ROW_3(BASENAME, BIAS) \ + ADD_ROW_2(BASENAME, BIAS) \ + BASENAME##2 += BIAS##2; + +#define ADD_ROW_4(BASENAME, BIAS) \ + ADD_ROW_3(BASENAME, BIAS) \ + BASENAME##3 += BIAS##3; + +#define ADD_ROW_5(BASENAME, BIAS) \ + ADD_ROW_4(BASENAME, BIAS) \ + BASENAME##4 += BIAS##4; + +#define ADD_ROW_6(BASENAME, BIAS) \ + ADD_ROW_5(BASENAME, BIAS) \ + BASENAME##5 += BIAS##5; + +#define ADD_ROW_7(BASENAME, BIAS) \ + ADD_ROW_6(BASENAME, BIAS) \ + BASENAME##6 += BIAS##6; + +#define ADD_ROW_8(BASENAME, BIAS) \ + ADD_ROW_7(BASENAME, BIAS) \ + BASENAME##7 += BIAS##7; + +#define ADD_ROW_9(BASENAME, BIAS) \ + ADD_ROW_8(BASENAME, BIAS) \ + BASENAME##8 += BIAS##8; + +#define ADD_ROW_10(BASENAME, BIAS) \ + ADD_ROW_9(BASENAME, BIAS) \ + BASENAME##9 += BIAS##9; + +#define ADD_ROW_11(BASENAME, BIAS) \ + ADD_ROW_10(BASENAME, BIAS) \ + BASENAME##A += BIAS##A; + +#define ADD_ROW_12(BASENAME, BIAS) \ + ADD_ROW_11(BASENAME, BIAS) \ + BASENAME##B += BIAS##B; + +#define ADD_ROW_13(BASENAME, BIAS) \ + ADD_ROW_12(BASENAME, BIAS) \ + BASENAME##C += BIAS##C; + +#define ADD_ROW_14(BASENAME, BIAS) \ + ADD_ROW_13(BASENAME, BIAS) \ + BASENAME##D += BIAS##D; + +#define ADD_ROW_15(BASENAME, BIAS) \ + ADD_ROW_14(BASENAME, BIAS) \ + BASENAME##E += BIAS##E; + +#define ADD_ROW_16(BASENAME, BIAS) \ + ADD_ROW_15(BASENAME, BIAS) \ + BASENAME##F += BIAS##F; + +/** @} */ // end of group ADD_ROW_n + +/** Add the block (BIAS) to another block (BASENAME) + * @name ADD_BLOCK + * + * Supported cases are N=1,2,3,...,16 + * + * @param[in] N The number of vectors in the block + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The basename of the added variables + * @{ + */ +#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) +#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) +/** @} */ // end of group ADD_BLOCK + +/** Broadcast (add single value) to the each element of the destination variables + * @name ADD_ROW_BROADCAST_n + * + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The variable containing the value to add + * @{ + */ +#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS; + +#define 
ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ + BASENAME##1 += BIAS; + +#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ + BASENAME##2 += BIAS; + +#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ + BASENAME##3 += BIAS; + +#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ + BASENAME##4 += BIAS; + +#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ + BASENAME##5 += BIAS; + +#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ + BASENAME##6 += BIAS; + +#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ + BASENAME##7 += BIAS; + +#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ + BASENAME##8 += BIAS; + +#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ + BASENAME##9 += BIAS; + +#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ + BASENAME##A += BIAS; + +#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ + BASENAME##B += BIAS; + +#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ + BASENAME##C += BIAS; + +#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ + BASENAME##D += BIAS; + +#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ + BASENAME##E += BIAS; + +#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ + BASENAME##F += BIAS; + +/** Broadcast (add a value) to the each element of the destination block (BASENAME) + * @name ADD_BLOCK_BROADCAST + * + * Supported cases are N=1,2,3,...,16. 
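+ *
+ * Purely illustrative expansion (hypothetical values N=3, BASENAME=c, BIAS=bias_val):
+ *   ADD_BLOCK_BROADCAST(3, c, bias_val);
+ * expands, via ADD_ROW_BROADCAST_3, to
+ *   c0 += bias_val;
+ *   c1 += bias_val;
+ *   c2 += bias_val;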
+ * + * @param[in] N The number of vectors in the block + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The variable containing the value to add + * @{ + */ +#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) +#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) +/** @} */ // end of group ADD_BLOCK_BROADCAST + +/** Apply activation to the given variables + * @name ACTIVATION_ROW_n + * + * @param[in] ACTIVATION_TYPE The type of the activation + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] A_VAL Additional value required by the activation + * @param[in] B_VAL Additional value required by the activation + * @{ + */ +#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL); + +#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL); + +#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL); + +#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL); + +#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL); + +#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL); + +#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL); + +#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL); + +#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL); + +#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL); + +#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL); + +#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##B = 
ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL); + +#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL); + +#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL); + +#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL); + +#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL); +/** @} */ // end of group ACTIVATION_ROW_n + +/** Apply activation to a block (BASENAME) + * @name ACTIVATION_BLOCK + * + * Supported cases are N=1,2,3,...,16. + * + * @param[in] N The number of vectors in the block + * @param[in] ACTIVATION_TYPE The type of the activation + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] A_VAL Additional value required by the activation + * @param[in] B_VAL Additional value required by the activation + * @{ + */ +#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) +/** @} */ // end of group ACTIVATION_BLOCK + +/** Apply convert_<data_type> to the given variables + * @name CONVERT_ROW_n + * + * @param[in] N The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME_SRC The basename of the source variables + * @param[in] BASENAME_DST The basename of the destination variables + */ +#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##5 = 
CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); +/** @} */ // end of group CONVERT_ROW_n + +/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST) + * @name CONVERT_BLOCK + * + * Supported cases N=1,2,3,...,16. 
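+ *
+ * Purely illustrative expansion (hypothetical values M=2, N=4, DATA_TYPE=half, BASENAME_SRC=a,
+ * BASENAME_DST=dst):
+ *   CONVERT_BLOCK(2, 4, half, a, dst);
+ * expands, via CONVERT_ROW_2, to
+ *   half4 dst0 = convert_half4(a0);
+ *   half4 dst1 = convert_half4(a1);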
+ * + * @param[in] M The number of vectors to convert + * @param[in] N The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME_SRC The basename of the source variables + * @param[in] BASENAME_DST The basename of the destination variables + */ +#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +/** @} */ // end of group CONVERT_BLOCK diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl new file mode 100644 index 000000000..c19766c9f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl @@ -0,0 +1,2733 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "gemm_helpers.h" +#include "helpers_asymm.h" +#include "repeat.h" + +#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. + * These macros use the dot8 instruction */ +#define ARM_DOT1(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ + }) +#define ARM_DOT2(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ + }) +#define ARM_DOT4(a, b, c) ({ ARM_DOT(a, b, c); }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) + +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] + * without using the dot8 instruction. 
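+ * For illustration, with hypothetical uchar4 operands a and b and an accumulator c of type
+ * ACC_DATA_TYPE (e.g. uint), ARM_DOT4(a, b, c) accumulates
+ *   c += a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3;
+ * using plain multiply-accumulate operations instead of the dot8 instruction.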
*/ +#define ARM_DOT1(a, b, c) ({ c += (ACC_DATA_TYPE)a * b; }) +#define ARM_DOT2(a, b, c) \ + ({ \ + c += (ACC_DATA_TYPE)a.s0 * b.s0; \ + c += (ACC_DATA_TYPE)a.s1 * b.s1; \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT2(a, b, c); \ + c += (ACC_DATA_TYPE)a.s2 * b.s2; \ + }) +#define ARM_DOT4(a, b, c) \ + ({ \ + ARM_DOT3(a, b, c); \ + c += (ACC_DATA_TYPE)a.s3 * b.s3; \ + }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 + * vectors "b" of size K0 [1,16] */ +#define ARM_DOT_K0X1(k0, a, b, c) ({ ARM_DOT_K0(k0, (a), (b##0), (c)); }) +#define ARM_DOT_K0X2(k0, a, b, c) \ + ({ \ + ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \ + ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \ + }) +#define ARM_DOT_K0X3(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X2(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \ + }) +#define ARM_DOT_K0X4(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X3(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \ + }) +#define ARM_DOT_K0X8(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X4(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \ + ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \ + ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \ + ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \ + }) +#define ARM_DOT_K0X16(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X8(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \ + ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \ + ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \ + ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \ + ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \ + ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \ + ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \ + ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \ + }) + +/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */ +#define ARM_MM_K0XN0X1(n0, k0, a, b, c) ({ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); }) +#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X1(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \ + }) +#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X2(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \ + }) +#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X3(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \ + }) +#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X4(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \ + }) +#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X5(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \ + }) +#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X6(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \ + }) +#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X7(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \ + }) + +#define ARM_DOT_K0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b), (c)); \ + }) + +#define ARM_DOT_K0XN0(n0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT_K0X, n0) \ + (k0, (a), b, (c)); \ + }) + +#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MM_K0XN0X, m0) \ + (n0, k0, a, b, c); \ + }) + +/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 + * vectors "b" of size K0 [1,16] */ +#define 
ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) ({ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; }) +#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \ + c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \ + }) +#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \ + }) +#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \ + }) +#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \ + c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \ + c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \ + c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \ + }) +#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \ + c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \ + c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \ + c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \ + c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \ + c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \ + c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \ + c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \ + }) +/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */ +#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); }) +#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \ + }) +#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \ + }) +#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \ + }) +#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \ + }) +#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \ + }) +#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \ + }) +#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \ + }) +#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MUL_N0X, k0) \ + (VECTOR_ACC_TYPE, (a), b, (c)); \ + }) +#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \ + (VECTOR_ACC_TYPE, k0, a, b, c); \ + }) + +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && \ + defined(N) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices with + * QASYMM/QASYMM_SIGNED data type. 
The LHS matrix must be reshaped with @ref + * CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed The RHS matrix must be reshaped + * with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (i.e. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option + * -DLHS_INTERLEAVE must passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8/QASYMM_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + + (z * lhs_stride_z); + + // Compute RHS matrix address + __global DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint 
zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + for (int i = 0; i < k; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + // Update address + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP); + rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is + * transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. 
+ * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in + * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + 
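+ // As a purely illustrative example, assume K0=4, N0=8 and H0=2: RHS_BLOCK_SIZE is then 32.
+ // With RHS_INTERLEAVE defined, the H0 blocks sharing an output row of the reshaped RHS are
+ // interleaved in K0-wide chunks, so the macros below give RHS_STEP_X = K0 * H0 = 8 (distance
+ // between consecutive chunks of the same block) and RHS_OFFSET_X = K0 = 4 (start of the next
+ // block on the same row).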
// RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + for (int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + lhs_offset += K0; + rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage + * using fixed-point arithmetic. The LHS matrix is NOT reshaped The RHS matrix is reshaped with @ref + * CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be + * passed at compile time using -DRESULT_OFFSET, -RESULT_MULTIPLIER and -DRESULT_SHIFT + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed + * at compile time. + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. 
+ * Supported data type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in + * X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in + * Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in + * the LHS reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. + * Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in + * X dimension (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in + * Y dimension (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in + * the RHS reshaped matrix + * @param[out] dst_ptr Pointer to the destination matrix + * Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in + * X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in + * Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in + * Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in + * Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS + * matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the + * output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. Supported data type: S32 + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. 
Supported data type: S32 + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. Supported data type: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint( + IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // 
Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + for (int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + lhs_offset += K0; + rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + // Result of MM is of type DATA_TYPE + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(DATA_TYPE) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert result of matrix multiplication to S32 + REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int); + + int batch_id = z; +#if defined(DEPTH_GEMM3D) + batch_id /= (int)DEPTH_GEMM3D; +#endif // defined(DEPTH_GEMM3D) + + // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET; + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET); + +#if defined(A_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_col_addr = + sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + +#if defined(SUM_COL_HAS_BATCHES) + sum_col_addr += z * sum_col_stride_y; +#endif // defined(SUM_COL_HAS_BATCHES) + VEC_DATA_TYPE(int, N0) + a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr); + a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET; + + REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32); +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to B_OFFSET + __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + + (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y; + +#if defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D) + sum_row_addr += (batch_id % (int)DEPTH_GEMM3D) * (int)HEIGHT_GEMM3D * sizeof(int); +#endif // defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D) + LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x); + + REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET); +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = + biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + + VEC_DATA_TYPE(int, N0) + bias_values = VLOAD(N0)(0, (__global int *)bias_addr); + REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values); +#endif // defined(ADD_BIAS) + + REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_); + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = result_multipliers_ptr + + result_multipliers_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + + VEC_DATA_TYPE(int, N0) + res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr); + VEC_DATA_TYPE(int, N0) + res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr); + + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, + RESULT_SHIFT); +#else // RESULT_SHIFT >= 0 + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, + RESULT_SHIFT); +#endif // RESULT_SHIFT < 0 + +#endif // defined(PER_CHANNEL_QUANTIZATION) + + // Add the offset terms to GEMM's result + REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET); + +#if defined(MIN_BOUND) + REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND); +#endif // defined(MIN_BOUND) 
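For reference, the arithmetic applied to each int32 accumulator in this fused output stage can be written as scalar C. The sketch below follows the RESULT_SHIFT >= 0 (multiplier-less-than-one) path only; the function and parameter names are illustrative and do not exist in the kernel, which performs the same computation through the REPEAT_* and ASYMM_* macros across the whole M0 x N0 block.

#include <stdint.h>

/* gemmlowp-style saturating rounding doubling high multiply: rounded high 32 bits of 2*a*b */
static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX; /* the single overflowing case saturates */
    int64_t ab = (int64_t)a * (int64_t)b;
    int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Round-to-nearest division by 2^exponent */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask = (int32_t)((1ll << exponent) - 1);
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* One accumulator: offset contribution, fixed-point requantization, offset, clamp, cast */
static uint8_t fused_output_stage_ref(int32_t acc, int32_t sum_col_n, int32_t sum_row_m,
                                      int32_t bias_n, int32_t a_offset, int32_t b_offset,
                                      int32_t k_offset, int32_t multiplier, int shift,
                                      int32_t result_offset, int32_t clamp_min, int32_t clamp_max)
{
    acc += a_offset * sum_col_n + b_offset * sum_row_m + k_offset + bias_n;
    acc = rounding_divide_by_pow2(sat_rounding_doubling_high_mul(acc, multiplier), shift);
    acc += result_offset;
    if (acc < clamp_min) acc = clamp_min;
    if (acc > clamp_max) acc = clamp_max;
    return (uint8_t)acc; /* QASYMM8 case; QASYMM8_SIGNED casts to int8_t instead */
}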
+#if defined(MAX_BOUND) + REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Convert and store output block (does convert saturate) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c_int, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && + // defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., + * -DK0=2) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in + * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z, + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + + for (; i <= (K - K0); i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,K0 +#if (GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += K0; + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over for loop + for (; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,1 +#if (GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += 1; + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) + +#if defined(COLS_A) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix + * A. It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at + * compile time. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. 
+ * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0; + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = + (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 16 accumulations + for (; i <= ((int)COLS_A - 16); i += 16) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i); + + sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)); + } + + // This for loop performs the leftover accumulations + for (; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + + sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3; + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A + * using the arm dot product instruction. It is also possible to multiply each reduced row by a + * scalar value, if SCALAR is passed at compile time. 
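The plain reduction above and the arm_dot variant described here compute the same quantity: one int32 sum per row of matrix A (a dot product against a vector of ones is simply a four-element sum). A minimal scalar C sketch of what either kernel produces is given below (names are illustrative); the matrix B reduction further down is the column-wise analogue.

#include <stdint.h>
#include <stddef.h>

/* Sum every row of a (rows x cols) quantized matrix A into one int32 per row.
 * The kernels unroll the inner loop 16 wide (or 32 wide with arm_dot). */
void matrix_a_reduction_ref(const uint8_t *a, size_t rows, size_t cols,
                            int32_t scalar, int32_t *sum_rows)
{
    for (size_t r = 0; r < rows; ++r)
    {
        int32_t acc = 0;
        for (size_t c = 0; c < cols; ++c)
        {
            acc += (int32_t)a[r * cols + c];
        }
        sum_rows[r] = acc * scalar; /* optional compile-time SCALAR; pass 1 if unscaled */
    }
}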
+ * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = + (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 16 accumulations + for (; i <= ((int)COLS_A - 32); i += 32) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + a0 = vload16(0, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + + a0 = vload16(1, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + } + + // This for loop performs the leftover accumulations + for (; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#endif // defined(COLS_A) + +#if defined(COLS_B) 
&& defined(ROWS_B) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of + * Matrix B. It is also possible to multiply each reduced column by a scalar value, if SCALAR is + * passed at compile time. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix B columns and rows needs to be passed at compile time using + * -DCOLS_B and -DROWS_B + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 16) + sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0; + + __global const DATA_TYPE *matrix_b = + (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z); + + int i = 0; + // This for loop performs 4 accumulations + for (; i <= ((int)ROWS_B - 4); i += 4) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b + 0 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b1 = vload16(0, matrix_b + 1 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b2 = vload16(0, matrix_b + 2 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b3 = vload16(0, matrix_b + 3 * src_stride_y); + + sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + + matrix_b += 4 * src_stride_y; + } + + // This for loop perfoms the leftover accumulations + for (; i < (int)ROWS_B; ++i) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b); + + sum_col_32 += 
CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + + matrix_b += src_stride_y; + } + +#if defined(SCALAR) + sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR; +#endif // defined(SCALAR) + VSTORE(16) + (convert_int16(sum_col_32), 0, (__global int *)dst.ptr); +} +#endif // defined(COLS_B) && defined(ROWS_B) + +#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(K_OFFSET) + +/* Helper function used to calculate the offset contribution after matrix multiplication. + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), + * and calculates the offset contribution of matrix A and matrix B. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * @param[in] x get_global_id(0) * 4 + * @param[in] y get_global_id(1) + * @param[in] z get_global_id(2) + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + */ +inline int4 offset_contribution(int x, int y, int z +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +) +{ + int4 a_offset_s32 = (int4)0; + int4 b_offset_s32 = (int4)0; + + int batch_id = z; +#if defined(DEPTH_INPUT3D) + batch_id /= (int)DEPTH_INPUT3D; +#endif // defined(DEPTH_INPUT3D) + +#if defined(A_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_col_addr = + sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int); + + // Compute the offset contribution due to A_OFFSET +#if defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); +#else // defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = vload4(0, (__global int *)sum_col_addr); +#endif // defined(SUM_COL_HAS_BATCHES) + + a_offset_s32 *= (int4)A_OFFSET; +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_row_addr = + sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int); + + // Compute the offset contribution due to B_OFFSET +#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); +#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); +#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 *= (int4)B_OFFSET; +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + b_offset_s32 += (int4)biases_values; +#endif // defined(ADD_BIAS) + + return (int4)K_OFFSET + a_offset_s32 + b_offset_s32; +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is + * performed in-place + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. 
Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * @param[in] mm_result_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + */ +__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS)) +) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // Store the result with the offset contribution + vstore4(in_s32, 0, (__global int *)mm_result_addr); +} + +#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && \ + defined(OUTPUT_DATA_TYPE) +/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and + * it quantizes down to uint8. + * + * This kernel takes a final int32 accumulator value (the output of + * @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and + * quantizes to uint8 through the output stage. + * + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage. 
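The correction formula above follows from expanding the quantized product: sum_k (a[i][k] + A_OFFSET) * (b[k][j] + B_OFFSET) = sum_k a[i][k]*b[k][j] + A_OFFSET*sum_col[j] + B_OFFSET*sum_row[i] + K*A_OFFSET*B_OFFSET, which is exactly why K_OFFSET is defined as a_offset * b_offset * k. A minimal in-place C sketch of the same correction follows (illustrative names, no vectorization or batching).

#include <stdint.h>
#include <stddef.h>

/* In-place offset contribution for an (m x n) int32 GEMM result.
 * sum_col[j] = sum over k of B[k][j]; sum_row[i] = sum over k of A[i][k];
 * k_offset = a_offset * b_offset * k. Reference only. */
void offset_contribution_ref(int32_t *mm_result, size_t m, size_t n,
                             const int32_t *sum_col, const int32_t *sum_row,
                             const int32_t *bias, /* may be NULL when ADD_BIAS is not used */
                             int32_t a_offset, int32_t b_offset, int32_t k_offset)
{
    for (size_t i = 0; i < m; ++i)
    {
        for (size_t j = 0; j < n; ++j)
        {
            int32_t term = k_offset + a_offset * sum_col[j] + b_offset * sum_row[i];
            if (bias)
                term += bias[j];
            mm_result[i * n + j] += term;
        }
    }
}

The output stage described next then requantizes these corrected int32 values down to the 8-bit output type.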
The output stage computes the + * following operations: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND + * are passed at compile time) + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] mm_result_ptr Pointer to the source tensor. + * Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of + * elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in + * the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. 
Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor + * Supported data type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in + * X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + , +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // -------------- OUTPUT STAGE + + // Add the offset terms to GEMM's result + in_s32 += (int4)RESULT_OFFSET; + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = + result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); + int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); + int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + + in_s32 *= result_multipliers_values; + in_s32 >>= result_shifts_values; +#else // defined(PER_CHANNEL_QUANTIZATION) + in_s32 *= RESULT_MULTIPLIER; + + in_s32 >>= RESULT_SHIFT; +#endif // defined(PER_CHANNEL_QUANTIZATION) + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes + * down to uint8. 
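Note the difference between the two fused output stages: gemmlowp_offset_contribution_quantize_down above adds RESULT_OFFSET first and then applies a plain integer multiply followed by an arithmetic right shift, while the fixed-point variant described here multiplies by an int32 fixed-point multiplier with round-to-nearest and adds the offset afterwards. A scalar C sketch of the simpler path is shown below (illustrative names; the kernel saturate-casts before clamping, which gives the same result for bounds inside the 8-bit range).

#include <stdint.h>

/* Simple (non fixed-point) quantize-down of one corrected int32 accumulator.
 * Assumes arithmetic right shift for signed values, as on common targets. */
uint8_t quantize_down_ref(int32_t acc, int32_t result_offset, int32_t result_mult,
                          int result_shift, int32_t clamp_min, int32_t clamp_max)
{
    acc += result_offset;           /* offset is applied before the multiply here */
    acc *= result_mult;
    acc >>= result_shift;           /* truncating shift, as in the kernel */
    if (acc < clamp_min) acc = clamp_min;
    if (acc > clamp_max) acc = clamp_max;
    return (uint8_t)acc;            /* QASYMM8 case */
}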
+ * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to + * it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output + * stage. + * + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage. The output stage computes the + * following operations: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] mm_result_ptr Pointer to the source tensor. + * Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of + * elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in + * the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. 
Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor + * Supported data type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in + * X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void +gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + , +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // -------------- OUTPUT STAGE + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = + result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); + int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); + int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + + int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + in_s32, result_multipliers_values, result_shifts_values, 4); + int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + in_s32, result_multipliers_values, result_shifts_values, 4); + in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + in_s32 = + ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + +#endif // defined(PER_CHANNEL_QUANTIZATION) + + // Add the offset terms to GEMM's result + in_s32 += (int4)RESULT_OFFSET; + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 
4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && + // defined(OUTPUT_DATA_TYPE) + +#endif // defined(K_OFFSET) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value and processes it to obtain the final + * QASYMM8/QASYMM8_SIGNED value. The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND + * are passed at compile time) + * -# Clamp the resulting int32 values: + * -# - to the [0..255] range and cast to QASYMM8. + * -# - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Add the offset terms to GEMM's result + input_values += (int4)RESULT_OFFSET; + + // Multiply by result_mult_int and shift + input_values *= RESULT_MULT_INT; + +#if RESULT_SHIFT < 0 + input_values >>= -RESULT_SHIFT; +#else // RESULT_SHIFT >= 0 + input_values >>= RESULT_SHIFT; +#endif // RESULT_SHIFT < 0 + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) + +#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && \ + defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. 
The following computations will be + * performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER + * and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + + // Add the offset terms to GEMM's result + input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && + // defined(RESULT_SHIFT) + +#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) + +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QSYMM16 value. 
The following computations will be performed by + * the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QSYMM16 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + + short4 res = convert_short4_sat(input_values); + +#if defined(MIN_BOUND) + res = max(res, (short4)MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (short4)MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global short *)dst_addr); +} +#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) + +#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. 
The following computations will be + * performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Requantize + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset and scalar scale factor must be passed at compile time using + * -DRESULT_OFFSET, -DREAL_MULTIPLIER + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Supported data + * type: same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in + * bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] dst_step_w src_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) +#if defined(DST_HEIGHT) + TENSOR4D_DECLARATION(dst)) +#else // defined(DST_HEIGHT) + TENSOR3D_DECLARATION(dst)) +#endif // defined(DST_HEIGHT) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Convert to float + float4 input_values_f = convert_float4(input_values); + input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET); + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl index 80ba73d1d..85fc09de4 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && \ - defined(COLS_A) + defined(COLS_A) #define VECTOR_CHAR VEC_DATA_TYPE(char, 
NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X) #define VECTOR_FLOAT VEC_DATA_TYPE(float, NUM_ELEMS_PROCESSED_PER_THREAD_X) @@ -117,7 +117,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( , uint dst_cross_plane_pad #endif // REINTERPRET_OUTPUT_AS_3D - ) +) { int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; @@ -208,9 +208,9 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); VECTOR_CHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( - 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); + 0, (__global char *)(src1_ptr + src_addr.s1 + src1_stride_y)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0.s0; @@ -251,7 +251,7 @@ __kernel void gemmlowp_mm_midgard_ex(IMAGE_DECLARATION(src0), IMAGE_DECLARATION( #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 // Load values from matrix B VECTOR_CHAR b0 = - VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global char *)(src1_ptr + src_addr.s1)); // Accumulate acc0 += CONVERT(b0, VECTOR_INT) * (VECTOR_INT)a0; diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl index a4f7dbd48..3ace1fde8 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/hashtable_lookup.cl @@ -115,15 +115,15 @@ __kernel void hashtable_lookup(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION int lup_id[4] = {0}; - lup_id[0] = (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) - : get_global_id(0); - lup_id[1] = (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) - : get_global_id(1); + lup_id[0] = + (NUM_DIMS == 1) ? *((__global int *)vector_offset(&lups, get_global_id(0))) : get_global_id(0); + lup_id[1] = + (NUM_DIMS == 2) ? *((__global int *)vector_offset(&lups, get_global_id(1))) : get_global_id(1); lup_id[2] = (NUM_DIMS == 3) ? *((__global int *)vector_offset(&lups, get_global_id(2))) : get_global_id(2) % DEPTH_OUT; lup_id[3] = (NUM_DIMS == 4) - ? *((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) - : get_global_id(2) / DEPTH_OUT; + ? 
*((__global int *)vector_offset(&lups, get_global_id(2) / DEPTH_OUT)) + : get_global_id(2) / DEPTH_OUT; if (lup_id[NUM_DIMS - 1] < 0) { diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h index e07a25ec9..4a3bc1369 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -49,7 +49,7 @@ #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ - defined(cl_arm_integer_dot_product_accumulate_int8) + defined(cl_arm_integer_dot_product_accumulate_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && // defined(cl_arm_integer_dot_product_accumulate_int8) @@ -288,21 +288,21 @@ #define VECTOR_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ - uint name##_offset_first_element_in_bytes + uint name##_offset_first_element_in_bytes #define IMAGE_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_offset_first_element_in_bytes #define TENSOR3D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ - uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ + uint name##_offset_first_element_in_bytes #define TENSOR4D_DECLARATION(name) \ __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ - uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ - uint name##_step_w, uint name##_offset_first_element_in_bytes + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes #define CONVERT_TO_VECTOR_STRUCT(name) \ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ @@ -406,9 +406,9 @@ inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_ uint stride_x, uint step_x) { Vector vector = { - .ptr = ptr, - .offset_first_element_in_bytes = offset_first_element_in_bytes, - .stride_x = stride_x, + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, }; vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; return vector; @@ -436,7 +436,7 @@ inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el .stride_x = stride_x, .stride_y = stride_y}; img.ptr += - img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; return img; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h index 5f1b3f902..d7f1d0814 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -100,16 +100,16 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return quantized values */ -#define QUANTIZE_IMPL(type, size) 
\ - inline VEC_DATA_TYPE(type, size) \ - quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ - { \ - VEC_DATA_TYPE(float, size) \ - out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ - VEC_DATA_TYPE(type, size) \ - res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \ - VEC_DATA_TYPE(type, size)); \ - return res; \ +#define QUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(type, size) \ + quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ + { \ + VEC_DATA_TYPE(float, size) \ + out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ + VEC_DATA_TYPE(type, size) \ + res = \ + CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ + return res; \ } /** Dequantize a vector of values to floating-point @@ -119,11 +119,11 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return dequantized values in floating point */ -#define DEQUANTIZE_IMPL(type, size) \ - inline VEC_DATA_TYPE(float, size) \ - dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ - { \ - return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ +#define DEQUANTIZE_IMPL(type, size) \ + inline VEC_DATA_TYPE(float, size) \ + dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ + { \ + return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ } /** Correctly-rounded-to-nearest division by a power-of-two. @@ -134,7 +134,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \ - VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ + VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ { \ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \ @@ -152,32 +152,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Product of two fixed-point numbers. 
*/ -#define ASYMM_MULT_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(int, size) \ - overflow = a == b && a == INT_MIN; \ - VEC_DATA_TYPE(long, size) \ - a_64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b_64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - ab_64 = a_64 * b_64; \ - /* Revert COMPMID-907 */ \ - VEC_DATA_TYPE(long, size) \ - mask1 = 1 << 30; \ - VEC_DATA_TYPE(long, size) \ - mask2 = 1 - (1 << 30); \ - VEC_DATA_TYPE(long, size) \ - is_positive_or_zero = ab_64 >= 0; \ - VEC_DATA_TYPE(long, size) \ - nudge = select(mask2, mask1, is_positive_or_zero); \ - VEC_DATA_TYPE(long, size) \ - mask = 1ll << 31; \ - VEC_DATA_TYPE(int, size) \ - ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ - return select(ab_x2_high32, INT_MAX, overflow); \ +#define ASYMM_MULT_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(int, size) \ + overflow = a == b && a == INT_MIN; \ + VEC_DATA_TYPE(long, size) \ + a_64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ + /* Revert COMPMID-907 */ \ + VEC_DATA_TYPE(long, size) \ + mask1 = 1 << 30; \ + VEC_DATA_TYPE(long, size) \ + mask2 = 1 - (1 << 30); \ + VEC_DATA_TYPE(long, size) \ + is_positive_or_zero = ab_64 >= 0; \ + VEC_DATA_TYPE(long, size) \ + nudge = select(mask2, mask1, is_positive_or_zero); \ + VEC_DATA_TYPE(long, size) \ + mask = 1ll << 31; \ + VEC_DATA_TYPE(int, size) \ + ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ + return select(ab_x2_high32, INT_MAX, overflow); \ } /** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). @@ -186,32 +186,32 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Result in fixed-point format Q0. 
*/ -#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ - a) \ - { \ - const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ - const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ - const int k_fractional_bits = 31; \ - VEC_DATA_TYPE(int, size) \ - x = a + (1 << (k_fractional_bits - 3)); \ - VEC_DATA_TYPE(int, size) \ - x2 = ASYMM_MULT(x, x, size); \ - VEC_DATA_TYPE(int, size) \ - x3 = ASYMM_MULT(x2, x, size); \ - VEC_DATA_TYPE(int, size) \ - x4 = ASYMM_MULT(x2, x2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2 = \ - ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ - VEC_DATA_TYPE(int, size) \ - x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ - ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ - return constant_term + \ - ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ + a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = \ + ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + \ + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ } /** Each bit of the result is set to the corresponding bit of either then_val or @@ -263,15 +263,15 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define EXP_BARREL_SHIFTER_IMPL(size) \ inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ - VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ - int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ { \ if (k_integer_bits > exponent) \ { \ const int k_shift_amount = k_integer_bits > exponent ? 
k_fractional_bits + exponent : 0; \ return ASYMM_SELECT_USING_MASK( \ - ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ - ASYMM_MULT(result, fp_multiplier, size), result, size); \ + ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ } \ \ return result; \ @@ -285,7 +285,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ { \ const int k_fractional_bits = 31 - k_integer_bits; \ VEC_DATA_TYPE(int, size) \ @@ -298,7 +298,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ VEC_DATA_TYPE(int, size) \ result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ - a_mod_quarter_minus_one_quarter_scaled, size); \ + a_mod_quarter_minus_one_quarter_scaled, size); \ VEC_DATA_TYPE(int, size) \ remainder = a_mod_quarter_minus_one_quarter - a; \ \ @@ -312,10 +312,10 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) remainder, size); \ result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ remainder, size); \ - result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ - size); \ result = \ - EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ + result = \ + EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ \ if (k_integer_bits > 5) \ { \ @@ -335,27 +335,27 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return Arithmetic left or right shift. 
*/ -#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ - { \ - if (exponent < 0) \ - { \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ - } \ - \ - const VEC_DATA_TYPE(int, size) min = INT_MIN; \ - const VEC_DATA_TYPE(int, size) max = INT_MAX; \ - int threshold = ((1 << (31 - exponent)) - 1); \ - VEC_DATA_TYPE(int, size) \ - positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ - VEC_DATA_TYPE(int, size) \ - negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ - VEC_DATA_TYPE(int, size) \ - result = x << exponent; \ - result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ - result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ - return result; \ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ } /** Calculates (a+b)/2, rounded to the nearest integer. @@ -365,21 +365,21 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) * * @return (a+b)/2, rounded to the nearest integer. */ -#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ - { \ - VEC_DATA_TYPE(long, size) \ - a64 = convert_long##size(a); \ - VEC_DATA_TYPE(long, size) \ - b64 = convert_long##size(b); \ - VEC_DATA_TYPE(long, size) \ - sum = a64 + b64; \ - const VEC_DATA_TYPE(long, size) one = 1; \ - const VEC_DATA_TYPE(long, size) minus_one = -1; \ - VEC_DATA_TYPE(long, size) \ - sign = select(minus_one, one, sum >= 0); \ - return convert_int##size((sum + sign) / 2); \ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, sum >= 0); \ + return convert_int##size((sum + sign) / 2); \ } /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). 
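For reference, the arithmetic that these ASYMM_* helpers and the gemmlowp output-stage kernels above rely on is the usual gemmlowp fixed-point scheme: a saturating doubling high multiply, a correctly rounded division by a power of two, then output offset, clamping and saturation. The host-side C sketch below illustrates one lane of that flow under that assumption; the function names are illustrative and are not part of this patch.

#include <stdint.h>

/* Correctly rounded division by a power of two (rounds half away from zero);
 * scalar counterpart of ASYMM_ROUNDING_DIVIDE_BY_POW2. */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
  const int32_t mask = (int32_t)((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* Saturating doubling high multiply; scalar counterpart of ASYMM_MULT.
 * Returns the rounded high 32 bits of 2*a*b, saturating the single
 * overflow case INT32_MIN * INT32_MIN. */
static int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;
  const int64_t ab = (int64_t)a * (int64_t)b;
  const int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* One lane of the quantize-down-fixedpoint output stage: rescale the int32
 * accumulator, add the output offset, clamp to the optional bounds and
 * saturate to QASYMM8. */
static uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias,
                                        int32_t fixedpoint_multiplier, int shift,
                                        int32_t offset_after_shift,
                                        int32_t min_bound, int32_t max_bound)
{
  acc += bias;                              /* only when -DADD_BIAS is defined */
  if (shift >= 0)
  {
    /* multiplier < 1: multiply, then rounding right shift */
    acc = saturating_rounding_doubling_high_mul(acc, fixedpoint_multiplier);
    acc = rounding_divide_by_pow2(acc, shift);
  }
  else
  {
    /* multiplier > 1: left shift first, then multiply */
    acc = saturating_rounding_doubling_high_mul(acc * (1 << -shift), fixedpoint_multiplier);
  }
  acc += offset_after_shift;                /* RESULT_OFFSET_AFTER_SHIFT */
  if (acc < min_bound) acc = min_bound;     /* MIN_BOUND */
  if (acc > max_bound) acc = max_bound;     /* MAX_BOUND */
  if (acc < 0)   acc = 0;                   /* saturate to unsigned 8 bit */
  if (acc > 255) acc = 255;
  return (uint8_t)acc;
}

The QSYMM16 and float variants above differ only in the final conversion step (a saturating cast to short, or round(x * REAL_MULTIPLIER + OUTPUT_OFFSET)); the per-channel path loads the multiplier and shift per output column instead of using the compile-time RESULT_* constants.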
@@ -390,7 +390,7 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) */ #define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ inline VEC_DATA_TYPE(int, size) \ - asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ { \ const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ @@ -462,14 +462,14 @@ inline float dequantize_qasymm8_signed(char input, float offset, float scale) #define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ asymm_rescale##size(value, src_integer_bits, dst_integer_bits) -#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ - inline VEC_DATA_TYPE(int, size) \ - multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ - { \ - const int left_shift = shift > 0 ? shift : 0; \ - const int right_shift = shift > 0 ? 0 : -shift; \ - return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ - right_shift, size); \ +#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ + { \ + const int left_shift = shift > 0 ? shift : 0; \ + const int right_shift = shift > 0 ? 0 : -shift; \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \ + right_shift, size); \ } #define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \ multiply_by_quantized_multiplier##size(input, qmul, shift) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl index 014842680..96a243110 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/instance_normalization_ex.cl @@ -41,7 +41,7 @@ #include "helpers.h" #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(DIM_X) && \ - defined(DIM_Y) && defined(DIM_Z) + defined(DIM_Y) && defined(DIM_Z) /** This function normalizes the input 2D tensor across the first dimension with respect to mean and * standard deviation of the same dimension. 
* @@ -108,14 +108,14 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output) #endif /* IN_PLACE */ #ifdef GAMMA - , + , VECTOR_DECLARATION(gamma) #endif // GAMMA #ifdef BETA - , + , VECTOR_DECLARATION(beta) #endif // BETA - ) +) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); #ifndef IN_PLACE @@ -213,12 +213,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (int i_h = 0; i_h < DIM_Z; ++i_h) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } @@ -231,12 +231,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -251,12 +251,12 @@ __kernel void instance_normalization_ex(TENSOR4D_DECLARATION(input), for (; x < DIM_X; ++x) { __global DATA_TYPE *input_address = - (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); #ifdef IN_PLACE __global DATA_TYPE *output_address = input_address; #else /* !IN_PLACE */ __global DATA_TYPE *output_address = - (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); + (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); #endif /* IN_PLACE */ *(output_address) = (*(input_address)-mean) * multip + beta; } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl new file mode 100644 index 000000000..51919c8a5 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2018-2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants + +/** Fill the tensor's planes with all value + * @attention The following variables must be passed at compile time: + * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * -# -DCONSTANT_VALUE = The value use to fill the tensor's planes + * -# -DVEC_SIZE = Vector size + * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might + * need to step back a bit) + * + * @param[in] tensor_ptr Pointer to the source image. Data types + * supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. 
+ * @param[in] tensor_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] tensor_step_x tensor_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] tensor_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] tensor_step_y tensor_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] value The value used to fill the pages of the tensor + */ +__kernel void memset(TENSOR3D_DECLARATION(tensor)) +{ + Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor); + +#if defined(VEC_SIZE) + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x; +#endif // defined(LAST_ACCESSED_X) + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = (DATA_TYPE)(CONSTANT_VALUE); + + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)tensor.ptr); +#else // !defined(VEC_SIZE) + *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE); +#endif // defined(VEC_SIZE) +} + +#endif // Check for compile time constants diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl index 3943fc4c2..abbfbd275 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/multiply_scale_factor.cl @@ -114,8 +114,8 @@ __kernel void multiply_scale_factor(IMAGE_DECLARATION(input), VECTOR_DECLARATION (val, 0, (__global DATA_TYPE *)output.ptr); #else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE *)(output.ptr)) = - ((DATA_TYPE)(*((__global int *)(input.ptr)))) * - *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); + ((DATA_TYPE)(*((__global int *)(input.ptr)))) * + *(((__global DATA_TYPE *)(scale_ptr)) + get_global_id(1)) * (DATA_TYPE)(multiplier); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl new file mode 100644 index 000000000..784a8d6aa --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/one_hot.cl @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
+
+/** Performs the OneHot operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] indices_ptr Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in] indices_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in] on_value_ptr Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in] on_value_stride_x Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in] on_value_step_x on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value
+ * vector
+ * @param[in] off_value_ptr Pointer to the off_value vector. Supported
+ * data types: Same as @p on_value.
+ * @param[in] off_value_stride_x Stride of the off_value vector in X
+ * dimension (in bytes)
+ * @param[in] off_value_step_x off_value_stride_x * number of elements
+ * along X processed per work item (in bytes)
+ * @param[in] off_value_offset_first_element_in_bytes Offset of the first element in the off_value
+ * vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+                      VECTOR_DECLARATION(off_value), TENSOR4D_DECLARATION(output))
+{
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+  const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+  const int index = *(__global const int *)tensor3D_offset(&indices, py, pz, pw);
+  *(__global DATA_TYPE *)output.ptr = index == px ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                   : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 1
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, pz, pw);
+  *(__global DATA_TYPE *)output.ptr = index == py ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                   : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 2
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pw);
+  *(__global DATA_TYPE *)output.ptr = index == pz ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                   : *((__global const DATA_TYPE *)off_value_ptr);
+#elif AXIS == 3
+  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+  *(__global DATA_TYPE *)output.ptr = index == pw ? *((__global const DATA_TYPE *)on_value_ptr)
+                                                   : *((__global const DATA_TYPE *)off_value_ptr);
+#endif // AXIS
+}
+
+/** Performs the OneHot operation along the chosen axis, assuming off_value is zero
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
+ * -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using
+ * -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using
+ * -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] indices_ptr Pointer to the source tensor. Supported data
+ * types: S32
+ * @param[in] indices_stride_x Stride of the source tensor in X dimension
+ * (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] indices_stride_y Stride of the source tensor in Y dimension
+ * (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along
+ * Y processed per work item (in bytes)
+ * @param[in] indices_stride_z Stride of the source tensor in Z dimension
+ * (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along
+ * Z processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the source
+ * tensor
+ * @param[in] on_value_ptr Pointer to the on_value vector. Supported
+ * data types: U8/S8/U16/S16/F16/U32/S32/F32.
+ * @param[in] on_value_stride_x Stride of the on_value vector in X dimension
+ * (in bytes)
+ * @param[in] on_value_step_x on_value_stride_x * number of elements along
+ * X processed per work item (in bytes)
+ * @param[in] on_value_offset_first_element_in_bytes Offset of the first element in the on_value
+ * vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p on_value
+ * @param[in] output_stride_x Stride of the destination tensor in X
+ * dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y
+ * dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z
+ * dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W
+ * dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W
+ * processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the
+ * destination tensor
+ */
+__kernel void one_hot_only_on_value(TENSOR3D_DECLARATION(indices), VECTOR_DECLARATION(on_value),
+                                    TENSOR4D_DECLARATION(output))
+{
+  const int px = get_global_id(0);
+  const int py = get_global_id(1);
+  const int pz = get_global_id(2);
+
+  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
+  const Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, OUTPUT_DIM_Z);
+
+  const int index = *(__global const int *)tensor3D_offset(&indices, px, py, pz);
+
+  if (index < 0 || index >= DEPTH)
+    return;
+
+#if AXIS == 0
+  *(__global DATA_TYPE *)tensor4D_offset(&output, index, px, py, pz) =
+    *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 1
+  *(__global DATA_TYPE *)tensor4D_offset(&output, px, index, py, pz) =
+    *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 2
+  *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, index, pz) =
+    *((__global const DATA_TYPE *)on_value_ptr);
+#elif AXIS == 3
+  *(__global DATA_TYPE *)tensor4D_offset(&output, px, py, pz, index) =
+    *((__global const DATA_TYPE *)on_value_ptr);
+#endif // AXIS
+}
+
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(DEPTH) && defined(OUTPUT_DIM_Z)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl new file
mode 100644 index 000000000..96f2f9ef0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && \ + defined(SRC_WIDTH) + +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE) +#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE) + +#if defined(CONST_VAL) +/** Perform a pad operation when PaddingMode is CONSTANT + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, + * e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. + * -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. + * -DSRC_WIDTH=224 + * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile + * flag, e.g. -DSELECT_DT=float + * @note In case pad left is more than the vector size, the number of threads to skip along the X + * axis must be passed using the -DNUM_THREADS_TO_SKIP_X compile flag, e.g. + * -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE) + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be + * passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. 
-DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. + * -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If pad also needs to be added to the batch of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. + * -DPAD_W_BEFORE=3) + * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: + * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * image + * @param[in] batch (Optional) Batch index if 4D pad must be applied + */ +__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(PAD_W_BEFORE) + , + uint batch +#endif // defined(PAD_W_BEFORE) +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + uint cond = 0; + +#if defined(PAD_W_BEFORE) + cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE); +#endif // defined(PAD_W_BEFORE) +#if defined(PAD_Z_BEFORE) + cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE); +#endif // defined(PAD_Z_BEFORE) + + if (cond) + { + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + VSTORE(VEC_SIZE) + ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr); + } + else + { + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#if defined(NUM_THREADS_TO_SKIP_X) + /* In case the pad left is greater than the vector size, and we are past the threads operating + * solely on pad values, the input pointer must be brought back along the X axis to start from + * the first non-pad values. + * + * E.g. 
with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|: + * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1)) + * < PAD_X_BEFORE + * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2, + * 3)) < PAD_X_BEFORE + * -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x * + * VEC_SIZE) == 4) values, reading |4 5| + * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and + * check that it is >= to the current x + * -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying + * this constant by the input's step along the X axis + * -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it + * will read the desired values |0 1| + */ + src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X); +#endif // defined(NUM_THREADS_TO_SKIP_X) +#if defined(PAD_Z_BEFORE) + src.ptr -= PAD_Z_BEFORE * src_step_z; +#endif // defined(PAD_Z_BEFORE) +#if defined(PAD_W_BEFORE) + src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z; +#endif // defined(PAD_W_BEFORE) + + VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + + VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT); + VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE); +#if defined(PAD_Y_BEFORE) + cond |= + (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE); +#endif // defined(PAD_Y_BEFORE) + VSTORE(VEC_SIZE) + (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0, + (__global DATA_TYPE *)dst.ptr); + } +} +#endif // defined(CONST_VAL) + +#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && \ + defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && \ + defined(AFTER_PAD_FACT_X) + +#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1 +#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n) +#define SYMM_REFL_LEFT(x, n0, n1) \ + select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0) +#define SYMM_REFL_RIGHT(x, n0, n1) \ + select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0) + +/** Perform a pad operation when PaddingMode is SYMMETRIC + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. + * -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. + * -DSRC_WIDTH=224 + * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile + * flag, e.g. -DSELECT_DT=float + * @note Number of values to the left when operating across left padding must be passed using the + * -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5 + * @note Number of values to the left when operating across right padding must be passed using the + * -DPAD_X_AFTER_REMAINDER compile flag, e.g. 
-DPAD_X_AFTER_REMAINDER=6 + * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is + * REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=6 + * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the + * -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER=5 + * @note When after pad X, starting point to read backward from must be passed using the + * -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253 + * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be + * set to 0 + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be + * passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. + * -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If the starting point to read backward from is less than the output's last element accessed + * in the X, the following compile flags must be passed at compile time to avoid negative offsets: + * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation + * attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: + * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * image + */ +__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Get current thread position + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Define conditions based on the thread X position w.r.t. 
pad left and right + const int x_out_first = x * VEC_SIZE; + const int x_out_last = x_out_first + VEC_SIZE; + const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE); + const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE); + const int is_inside_input = + (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE)); + const int is_across_pad_right = + (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE)); + const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE)); + + // Calculate base pointers + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes; + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Calculate input tensor's offset based on the defined conditions + int x_offset = 0; + x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left); + x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input); + x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right); + x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right); + +#if defined(AFTER_PAD_REM) + int neg_offs = x_offset < 0; + x_offset = max(x_offset, 0); +#endif // defined(AFTER_PAD_REM) + + // Load input values from the computed offset + int y_in = y; + int z_in = z; +#if defined(PAD_Y_BEFORE) + y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE); + y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, + y >= (SRC_HEIGHT + PAD_Y_BEFORE)); +#endif // defined(PAD_Y_BEFORE) +#if defined(PAD_Z_BEFORE) + z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE); + z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, + z >= (SRC_DEPTH + PAD_Z_BEFORE)); +#endif // defined(PAD_Y_BEFORE) + + src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z; + +#if SRC_WIDTH == 1 + VSTORE(VEC_SIZE) + ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr); +#else // SRC_WIDTH == 1 + + VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + + // Choose rearrangement policy based on the defined conditions + src_vals = + select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), + SCALAR_COND(is_across_pad_left)); + src_vals = + select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), + SCALAR_COND(is_across_pad_right)); + src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE), + SCALAR_COND((is_before_pad_left || is_after_pad_right))); +#if defined(AFTER_PAD_REM) + src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs)); +#endif // defined(AFTER_PAD_REM) + + // Store + VSTORE(VEC_SIZE) + (src_vals, 0, (__global DATA_TYPE *)dst.ptr); +#endif // SRC_WIDTH == 1 +} +#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && + // defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && + // defined(AFTER_PAD_FACT_X) +#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && + // defined(SRC_WIDTH) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl index 76fda9041..532000e9e 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl +++ 
b/compute/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -138,7 +138,7 @@ __kernel void pixelwise_mul_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARAT // Multiply with a multiplier smaller than 1 out_val = - ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl index 4ae9adb0b..c829f264d 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/quantization_symm8.cl @@ -116,7 +116,7 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc // Create scale vector const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = - *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); + *(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1)); // Quantize VEC_DATA_TYPE(int, VEC_SIZE) @@ -127,10 +127,10 @@ __kernel void quantization_symm8(IMAGE_DECLARATION(input), VECTOR_DECLARATION(sc (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr); #else //! defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP( - CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / - (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), - int), - MIN_QUANT_VAL, MAX_QUANT_VAL); + CONVERT_RTE((*(__global DATA_TYPE_IN *)input.ptr) / + (*(((__global DATA_TYPE_IN *)(scale_ptr)) + get_global_id(1))), + int), + MIN_QUANT_VAL, MAX_QUANT_VAL); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } #endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl index 832ac1270..d0ef31b20 100644 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl @@ -100,12 +100,14 @@ __kernel void reduce_min_max(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(o Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE value = - *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); for (int i = 1; i < dim; ++i) { indices[axis] = i; @@ -186,16 +188,18 @@ __kernel void reduce_sum_mean(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION( Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); int indices[4] = { - get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT, - get_global_id(2) / DEPTH_OUT, + get_global_id(0), + get_global_id(1), + get_global_id(2) % DEPTH_OUT, + get_global_id(2) / DEPTH_OUT, }; DATA_TYPE sum_value = (DATA_TYPE)0; for (int i = 0; i < dim; ++i) { indices[axis] = i; - sum_value += *( - (__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); 
+ sum_value += + *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); } #if OP_CODE == 3 // REDUCE_SUM diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h new file mode 100644 index 000000000..cfc811cce --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_REPEAT_H +#define ARM_COMPUTE_REPEAT_H + +#include "helpers.h" + +/** Macros that help in loop unrolling */ +// Repeat macros with 3 param, excluding the implicit ID param +#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) +#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ + P_X##_DEF(1, P_A, P_B, P_C); \ + REPEAT_3_1(P_X, P_A, P_B, P_C) +#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ + P_X##_DEF(2, P_A, P_B, P_C); \ + REPEAT_3_2(P_X, P_A, P_B, P_C) +#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ + P_X##_DEF(3, P_A, P_B, P_C); \ + REPEAT_3_3(P_X, P_A, P_B, P_C) +#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ + P_X##_DEF(4, P_A, P_B, P_C); \ + REPEAT_3_4(P_X, P_A, P_B, P_C) +#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ + P_X##_DEF(5, P_A, P_B, P_C); \ + REPEAT_3_5(P_X, P_A, P_B, P_C) +#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ + P_X##_DEF(6, P_A, P_B, P_C); \ + REPEAT_3_6(P_X, P_A, P_B, P_C) +#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ + P_X##_DEF(7, P_A, P_B, P_C); \ + REPEAT_3_7(P_X, P_A, P_B, P_C) +#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ + P_X##_DEF(8, P_A, P_B, P_C); \ + REPEAT_3_8(P_X, P_A, P_B, P_C) +#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ + P_X##_DEF(9, P_A, P_B, P_C); \ + REPEAT_3_9(P_X, P_A, P_B, P_C) +#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ + P_X##_DEF(A, P_A, P_B, P_C); \ + REPEAT_3_10(P_X, P_A, P_B, P_C) +#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ + P_X##_DEF(B, P_A, P_B, P_C); \ + REPEAT_3_11(P_X, P_A, P_B, P_C) +#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ + P_X##_DEF(C, P_A, P_B, P_C); \ + REPEAT_3_12(P_X, P_A, P_B, P_C) +#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ + P_X##_DEF(D, P_A, P_B, P_C); \ + REPEAT_3_13(P_X, P_A, P_B, P_C) +#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ + P_X##_DEF(E, P_A, P_B, P_C); \ + REPEAT_3_14(P_X, P_A, P_B, P_C) +#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ + P_X##_DEF(F, P_A, P_B, P_C); \ + REPEAT_3_15(P_X, P_A, P_B, P_C) + +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \ + REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) // One level of indirection to ensure order of expansion + // does not affect preprocessing P_NUM +#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) + +// Repeat macros with 4 param, excluding the implicit ID param +#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) +#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(1, P_A, P_B, P_C, P_D); \ + REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(2, P_A, P_B, P_C, P_D); \ + REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(3, P_A, P_B, P_C, P_D); \ + REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(4, P_A, P_B, P_C, P_D); \ + REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(5, P_A, P_B, P_C, P_D); \ + REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(6, P_A, P_B, P_C, P_D); \ + REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(7, P_A, P_B, P_C, P_D); \ + REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(8, P_A, P_B, P_C, P_D); \ + REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(9, P_A, P_B, P_C, P_D); \ + REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(A, P_A, P_B, P_C, P_D); \ + REPEAT_4_10(P_X, P_A, P_B, 
P_C, P_D) +#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(B, P_A, P_B, P_C, P_D); \ + REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(C, P_A, P_B, P_C, P_D); \ + REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(D, P_A, P_B, P_C, P_D); \ + REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(E, P_A, P_B, P_C, P_D); \ + REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(F, P_A, P_B, P_C, P_D); \ + REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) + +#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \ + REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) // One level of indirection to ensure order of + // expansion does not affect preprocessing P_NUM +#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) + +// Macro for initializing N variables. Generates N statements that defines VAR##N = +// RHS_ACCESSOR_DEF(...) +#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL +#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) + +// Macro for initializing N variables by converting the data type. Generates N statements that +// defines VAR##N = RHS_ACCESSOR_DEF(...) +#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \ + TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) +#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \ + REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) + +// Macro for adding a constant to N variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) +#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL +#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) + +// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables +// (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...) +#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL +#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) \ + REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) + +// Macro for adding a vector to N-variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) +#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC +#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) + +// Macro for adding a two N-variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) +#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID +#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) + +// Macro for performing Max between a constant and N variables. Generates N statements that defines +// VAR##N =RHS_ACCESSOR_DEF(...) +#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) +#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) + +// Macro for performing Min between a constant and N variables. Generates N statements that defines +// VAR##N =RHS_ACCESSOR_DEF(...) 
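+// For illustration: following the REPEAT_3_N expansion rules defined above, a call such as
+// REPEAT_MIN_CONST_VAR(3, int, x, 255) (defined below) expands to the three statements
+//   x2 = min(x2, (int)255); x1 = min(x1, (int)255); x0 = min(x0, (int)255);
+// i.e. a repeat count of N produces the IDs N-1 down to 0, each appended to the variable name.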
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) +#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) + +// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N +// statements that defines VAR##N =RHS_ACCESSOR_DEF(...) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) + +// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N +// statements that defines VAR##N =RHS_ACCESSOR_DEF(...) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) + +// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables. +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + ({ \ + VEC_DATA_TYPE(int, N0) \ + VAR##ID_shift_lt0 = \ + ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ + VEC_DATA_TYPE(int, N0) \ + VAR##ID_shift_gt0 = \ + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ + VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ + }) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) + +#endif // ARM_COMPUTE_REPEAT_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl new file mode 100644 index 000000000..8da8bfc8e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Perform tensor reshape + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * + * @param[in] input_ptr Pointer to the first source tensor. Supported + * data types: All + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] input_shape Input spatial shape + * @param[in] output_shape Output spatial shape + */ +__kernel void reshape_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), + int2 input_shape, int2 output_shape) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); + + int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); + + // Linearize index + int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y; + + // Translate to output + int3 out_id; + out_id.x = linear_idx % output_shape.x; + out_id.y = (linear_idx / output_shape.x) % output_shape.y; + out_id.z = linear_idx / (output_shape.x * output_shape.y); + + // Store result + *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = + *((__global DATA_TYPE *)in.ptr); +} diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl deleted file mode 100644 index e9d4696b4..000000000 --- a/compute/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// reference: -// https://code.google.com/archive/p/ocl-radix-sort/source/default/source -// OpenCL kernel sources for the CLRadixSort class -// the #include does not exist in OpenCL -// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr -// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html -// if you find this software usefull you can cite the following work in your reports or articles: -// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. -// http://hal.archives-ouvertes.fr/hal-00596730 - -// Reference for floating point radix sort: -// http://www.codercorner.com/RadixSortRevisited.htm - -// compute the histogram for each radix and each virtual processor for the pass -__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms, - const int pass, __local int *loc_histo, const int n) -{ - int it = get_local_id(0); // i local number of the processor - int ig = get_global_id(0); // global number = i + g I - - int gr = get_group_id(0); // g group number - - int groups = get_num_groups(0); - int items = get_local_size(0); - - // set the local histograms to zero - for (int ir = 0; ir < _RADIX; ir++) - { - loc_histo[ir * items + it] = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // range of keys that are analyzed by the work item - int size = n / groups / items; // size of the sub-list - int start = ig * size; // beginning of the sub-list - - unsigned int key; - int shortkey, k; - - // compute the index - // the computation depends on the transposition - for (int j = 0; j < size; j++) - { -#ifdef TRANSPOSE - k = groups * items * j + ig; -#else - k = j + start; -#endif - - key = *((__global unsigned int *)(in_key_buf + k)); - - // extract the group of _BITS bits of the pass - // the result is in the range 0.._RADIX-1 - shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); - - // increment the local histogram - loc_histo[shortkey * items + it]++; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // copy the local histogram to the global one - for (int ir = 0; ir < _RADIX; ir++) - { - d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; - } - - barrier(CLK_GLOBAL_MEM_FENCE); -} - -// initial transpose of the list for improving -// coalescent memory access -__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol, - const int nbrow, const __global int *inperm, __global int *outperm, - __local int *blockmat, __local int *blockperm, const int tilesize) -{ - - int i0 = get_global_id(0) * tilesize; // first row index - int j = get_global_id(1); // column index - - int jloc = get_local_id(1); // local column index - - // fill the cache - for (int iloc = 0; iloc < tilesize; iloc++) - { - int k = (i0 + iloc) * nbcol + j; // position in the matrix - blockmat[iloc * tilesize + jloc] = invect[k]; -#ifdef PERMUT - blockperm[iloc * tilesize + jloc] = inperm[k]; -#endif - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // first row index in the transpose - int j0 = get_group_id(1) * tilesize; - - // put the cache at the good place - for (int iloc = 0; iloc < tilesize; iloc++) - { - int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the 
transpose - outvect[kt] = blockmat[jloc * tilesize + iloc]; -#ifdef PERMUT - outperm[kt] = blockperm[jloc * tilesize + iloc]; -#endif - } -} - -// each virtual processor reorders its data using the scanned histogram -__kernel void radixsort_reorder(__global float *in_key, __global float *out_key, - __global int *d_Histograms, const int pass, - __global int *indices_in, __global int *indices_out, - __local int *loc_histo, const int n) -{ - - int it = get_local_id(0); - int ig = get_global_id(0); - - int gr = get_group_id(0); - int groups = get_num_groups(0); - int items = get_local_size(0); - - int start = ig * (n / groups / items); - int size = n / groups / items; - - // take the histogram in the cache - for (int ir = 0; ir < _RADIX; ir++) - { - loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int newpos, shortkey, k, newpost; - unsigned int key; - - for (int j = 0; j < size; j++) - { -#ifdef TRANSPOSE - k = groups * items * j + ig; -#else - k = j + start; -#endif - float org_value = in_key[k]; - key = *(__global unsigned int *)(in_key + k); - shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1)); - - newpos = loc_histo[shortkey * items + it]; - -#ifdef TRANSPOSE - int ignew, jnew; - ignew = newpos / (n / groups / items); - jnew = newpos % (n / groups / items); - newpost = jnew * (groups * items) + ignew; -#else - newpost = newpos; -#endif - - // d_outKeys[newpost]= key; // killing line !!! - out_key[newpost] = org_value; - -#ifdef PERMUT - indices_out[newpost] = indices_in[k]; -#endif - - newpos++; - loc_histo[shortkey * items + it] = newpos; - } -} - -// perform a parallel prefix sum (a scan) on the local histograms -// (see Blelloch 1990) each workitem worries about two memories -// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html -__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp, - __global int *globsum) -{ - int it = get_local_id(0); - int ig = get_global_id(0); - int decale = 1; - int n = get_local_size(0) * 2; - int gr = get_group_id(0); - - // load input into local memory - // up sweep phase - temp[2 * it] = histo[2 * ig]; - temp[2 * it + 1] = histo[2 * ig + 1]; - - // parallel prefix sum (algorithm of Blelloch 1990) - for (int d = n >> 1; d > 0; d >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (it < d) - { - int ai = decale * (2 * it + 1) - 1; - int bi = decale * (2 * it + 2) - 1; - temp[bi] += temp[ai]; - } - decale *= 2; - } - - // store the last element in the global sum vector - // (maybe used in the next step for constructing the global scan) - // clear the last element - if (it == 0) - { - globsum[gr] = temp[n - 1]; - temp[n - 1] = 0; - } - - // down sweep phase - for (int d = 1; d < n; d *= 2) - { - decale >>= 1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (it < d) - { - int ai = decale * (2 * it + 1) - 1; - int bi = decale * (2 * it + 2) - 1; - - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - // write results to device memory - - histo[2 * ig] = temp[2 * it]; - histo[2 * ig + 1] = temp[2 * it + 1]; - - barrier(CLK_GLOBAL_MEM_FENCE); -} - -// use the global sum for updating the local histograms -// each work item updates two values -__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum) -{ - int ig = get_global_id(0); - int gr = get_group_id(0); - - int s; - - s = globsum[gr]; - - // write results to device memory - histo[2 * ig] += s; - histo[2 * ig + 1] += s; - - 
barrier(CLK_GLOBAL_MEM_FENCE); -} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp new file mode 100644 index 000000000..987409739 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +constexpr unsigned int vector_size = 16; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, ReductionOperation op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && + op != ReductionOperation::ARG_IDX_MIN, + "Only ARG_IDX_MAX and ARG_IDX_MIN are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, + DataType::S64); + } + if (prev_output != nullptr && prev_output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(prev_output, 1, DataType::U32, + DataType::S32, DataType::S64); + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(prev_output, output); + } + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *prev_output, + ITensorInfo *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_UNUSED(op); + // Output tensor auto initialization if not yet initialized + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(axis, 1); + DataType output_data_type = (prev_output != nullptr) ? (prev_output->data_type()) : DataType::S32; + auto_init_if_empty(*output, input->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + Window win = + calculate_max_window((prev_output != nullptr) ? (*prev_output) : (*input), Steps(vector_size)); + bool window_changed = false; + + switch (axis) + { + case 0: + { + ITensorInfo *input_tensor_access = prev_output != nullptr ? 
prev_output : input; + AccessWindowStatic input_access(input_tensor_access, 0, 0, + static_cast<int>(input_tensor_access->dimension(0)), 1); + AccessWindowHorizontal output_access(output, 0, 1); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + break; + case 1: + case 2: + case 3: + { + AccessWindowHorizontal input_access(input, 0, vector_size); + AccessWindowHorizontal output_access(output, 0, vector_size); + window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_tuple(err, win); +} +} // namespace + +CLArgMinMaxLayerKernelEx::CLArgMinMaxLayerKernelEx() + : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), + _op(ReductionOperation::ARG_IDX_MAX) +{ +} + +void CLArgMinMaxLayerKernelEx::configure(const ICLTensor *input, const ICLTensor *prev_output, + ICLTensor *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, + output->info(), axis, op)); + auto win_config = validate_and_configure_window( + input->info(), (prev_output != nullptr) ? prev_output->info() : nullptr, output->info(), axis, + op); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + _input = input; + _prev_output = prev_output; + _output = output; + _reduction_axis = axis; + _op = op; + + // Set build options + CLBuildOptions build_opts; + + build_opts.add_option_if(_prev_output != nullptr, "-DPREV_OUTPUT"); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE"); + build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN"); + build_opts.add_option("-DDATA_TYPE_OUTPUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_SELECT=" + + get_cl_signed_type_from_element_size(input->info()->element_size())); + + // Create kernel + cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange(); + std::string kernel_axis_name; + switch (axis) + { + case 0: + { + const ICLTensor *input_for_width = prev_output != nullptr ? 
_prev_output : _input; + build_opts.add_option("-DWIDTH=" + + support::cpp11::to_string(input_for_width->info()->dimension(0))); + + kernel_axis_name = "x"; + lws_hint = create_lws_hint_parallel_implementations(input_for_width->info()->dimension(0), + vector_size); + } + break; + case 1: + build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); + kernel_axis_name = "y"; + break; + case 2: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + kernel_axis_name = "z"; + break; + case 3: + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3))); + kernel_axis_name = "w"; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "arg_min_max_ex_" + kernel_axis_name, build_opts.options())); + + // Configure kernel window + ICLKernel::configure_internal(std::get<1>(win_config), lws_hint); +} + +Status CLArgMinMaxLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *prev_output, + const ITensorInfo *output, unsigned int axis, + ReductionOperation op) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, prev_output, output, axis, op)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (prev_output != nullptr) ? prev_output->clone().get() : nullptr, + output->clone().get(), axis, op))); + return Status{}; +} + +void CLArgMinMaxLayerKernelEx::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + switch (_reduction_axis) + { + case 0: + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + // Reshape window + const unsigned int num_tensors = _prev_output != nullptr ? 
3 : 2; + + // Set local sums buffer + unsigned int local_res_size = lws_hint()[0] * _output->info()->element_size(); + _kernel.setArg(num_arguments_per_2D_tensor() * num_tensors, local_res_size, nullptr); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + if (_prev_output != nullptr) + { + add_2D_tensor_argument(idx, _prev_output, in_slice); + } + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + } + break; + case 1: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), + _input->info()->dimension(1))); + Window in_slice = window_in.first_slice_window_2D(); + Window out_slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_2D(in_slice) && + window.slide_window_slice_2D(out_slice)); + } + break; + case 2: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), + _input->info()->dimension(2))); + Window in_slice = window_in.first_slice_window_3D(); + Window out_slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_3D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_3D(in_slice) && + window.slide_window_slice_3D(out_slice)); + } + break; + case 3: + { + // Get first input and output slices + Window window_in{window}; + window_in.set(3, Window::Dimension(0, 1, 1)); + Window in_slice = window_in.first_slice_window_4D(); + Window out_slice = window.first_slice_window_4D(); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, in_slice); + add_4D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, lws_hint()); + } while (window_in.slide_window_slice_4D(in_slice) && + window.slide_window_slice_4D(out_slice)); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index fbc76f5e1..a5daa2410 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -43,6 +43,8 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/AccessWindowStatic.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -55,7 +57,7 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { const TensorShape &out_shape = - TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, 
DataType::QASYMM8); @@ -68,15 +70,15 @@ Status validate_parameters(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), - "Wrong shape for output"); + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); } return Status{}; } } // namespace CLBinaryLogicalOpKernel::CLBinaryLogicalOpKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) + : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -111,13 +113,13 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor build_opts.emplace(("-DOP_CODE=" + support::cpp11::to_string(op_code))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); const std::pair<TensorShape, ValidRegion> broadcast_pair = - ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); + ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info()); const ValidRegion &valid_region = broadcast_pair.second; @@ -130,8 +132,8 @@ void CLBinaryLogicalOpKernel::configure(const ICLTensor *input1, const ICLTensor AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win_input1, input1_access) || - update_window_and_padding(win_input2, input2_access) || - update_window_and_padding(win, output_access); + update_window_and_padding(win_input2, input2_access) || + update_window_and_padding(win, output_access); output_access.set_valid_region(win, valid_region); @@ -151,7 +153,7 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = - (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); @@ -160,13 +162,13 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) bool has_collapsed = false; Window collapsed = - can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) - : window; + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; const TensorShape &in_shape1_collapsed = - has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = - has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; Window slice = collapsed.first_slice_window_3D(); Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); @@ -189,9 +191,9 @@ void CLBinaryLogicalOpKernel::run(const Window &window, cl::CommandQueue &queue) BorderSize CLBinaryLogicalOpKernel::border_size() const { const unsigned int replicateSize = - _output->info()->dimension(0) - - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); const unsigned int border = - std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); return BorderSize(0, border, 0, 0); } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp new file mode 100644 index 000000000..dc06bfbb3 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" + +#include "support/StringSupport.h" + +#include <cstddef> +#include <set> +#include <string> + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(input == output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::S16, DataType::U16, DataType::U32, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == output->data_type(), + "Input and output data types must be different"); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} +} // namespace + +void CLCastBoolKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype + // must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + // Get number of elements to process per iterations + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_OUT=" + + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + const std::string kernel_name = "cast_bool"; + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + + // Configure kernel + ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration); + + // Collapse window + const Window &full_window = window(); + Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ); + ICLKernel::configure_internal(collapsed_window); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(output->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); +} + +Status CLCastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + + return Status{}; +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index 67aaf2db6..4206f1fd4 100644 --- 
a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; @@ -61,14 +64,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace CLEmbeddingLookupKernel::CLEmbeddingLookupKernel() - : _input(nullptr), _output(nullptr), _lookups(nullptr) + : _input(nullptr), _output(nullptr), _lookups(nullptr) { } @@ -77,8 +80,8 @@ Status CLEmbeddingLookupKernel::validate(const ITensorInfo *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -108,8 +111,8 @@ void CLEmbeddingLookupKernel::configure(const ICLTensor *input, ICLTensor *outpu build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 3bfe3e407..62da2376e 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -45,6 +45,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/core/UtilsEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" using namespace arm_compute; @@ -62,15 +66,15 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis); + input->tensor_shape(), indices->tensor_shape(), actual_axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -86,7 +90,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); std::unique_ptr<ITensorInfo> output_info = input->clone(); output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), actual_axis)); + input->tensor_shape(), indices->tensor_shape(), actual_axis)); // Output auto initialization if not yet initialized auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); @@ -100,7 +104,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLGatherExKernel::CLGatherExKernel() - : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0) { } @@ -109,11 +113,11 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), indices->info(), output->info(), axis)); + validate_arguments(input->info(), indices->info(), output->info(), axis)); // Configure kernel window auto win_config = - validate_and_configure_window(input->info(), indices->info(), output->info(), axis); + validate_and_configure_window(input->info(), indices->info(), output->info(), axis); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); _input = input; @@ -133,7 +137,7 @@ void CLGatherExKernel::configure(const ICLTensor *input, const ICLTensor *indice // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("gather_ex", build_opts.options())); ICLKernel::configure_internal(win_config.second); } @@ -144,7 +148,7 @@ Status CLGatherExKernel::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis) - .first); + .first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index 930e7c944..03ca6ddcb 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -61,8 +62,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen input_access.set_valid_region(win, output->valid_region()); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_pair(err, win); } } // namespace @@ -78,8 +79,8 @@ Status CLHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(hits, 1, DataType::U8, DataType::QASYMM8); @@ -102,7 +103,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -111,9 +112,9 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso _hits = hits; // Make _lookup_indices tensor - _lookup_indices = support::cpp14::make_unique<CLTensor>(); + _lookup_indices = std::make_unique<CLTensor>(); _lookup_indices->allocator()->init( - TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); + TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); // Set kernel build options @@ -127,8 +128,8 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso build_opts.emplace("-DNUM_DIMS=" + support::cpp11::to_string(_input->info()->num_dimensions())); // Create kernel - _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name.str(), build_opts)); // Configure kernel window auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -148,7 +149,7 @@ void CLHashtableLookupKernel::run(const Window &window, cl::CommandQueue &queue) // Set values of hits const int32_t *lookups_buf = - reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); + reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_lookups)->buffer()); const int32_t *keys_buf = reinterpret_cast<int32_t *>(const_cast<ICLTensor *>(_keys)->buffer()); uint8_t *hits_buf = reinterpret_cast<uint8_t *>(_hits->buffer()); int32_t *lookup_indices_buf = reinterpret_cast<int32_t *>(_lookup_indices->buffer()); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index 61c14d271..945af3c51 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -42,12 +42,16 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include 
"arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include "support/ToolchainSupport.h" @@ -94,8 +98,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace CLInstanceNormalizationLayerKernelEx::CLInstanceNormalizationLayerKernelEx() - : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), - _run_in_place(false) + : _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(1e-12), + _run_in_place(false) { } @@ -132,7 +136,7 @@ void CLInstanceNormalizationLayerKernelEx::configure(ICLTensor *input, ICLTensor // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("instance_normalization_ex", build_opts.options())); // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info()); @@ -147,7 +151,7 @@ Status CLInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp new file mode 100644 index 000000000..a00fc5e2e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" + +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +CLMemsetKernel::CLMemsetKernel() : ICLKernel(), _tensor(nullptr), _full_window() {} + +void CLMemsetKernel::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window) +{ + configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window); +} + +void CLMemsetKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, + const PixelValue &constant_value, Window *window) +{ + ARM_COMPUTE_UNUSED(compile_context); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window)); + + _tensor = tensor; + + const DataType data_type = tensor->info()->data_type(); + const int vec_size_x = 16 / tensor->info()->element_size(); + + // Create and update the window (if needed) + _full_window = calculate_max_window(*tensor->info()); + Window win = _full_window; + if (window != nullptr) + { + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window); + win = *window; + } + + const int output_width_x = win.num_iterations(0); + const bool multi_access_x = output_width_x >= vec_size_x; + const bool remainder_x = output_width_x % vec_size_x > 0; + + if (multi_access_x) + { + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type)); + build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max<int>(output_width_x - vec_size_x, 0))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("memset", build_opts.options())); +} + +Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, + Window *window) +{ + ARM_COMPUTE_UNUSED(tensor); + ARM_COMPUTE_UNUSED(constant_value); + if (window != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); + } + return Status{}; +} + +void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Collapse all the batches on the third + Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _tensor, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} +} // namespace arm_compute diff --git 
a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index 6b27c9917..da7437e97 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -40,15 +40,19 @@ #include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" using namespace arm_compute; @@ -99,7 +103,7 @@ std::tuple<Status, Window> validate_and_configure_window(const ITensorInfo *inpu } // namespace CLMultiplyScaleFactorKernel::CLMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -108,7 +112,7 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -123,9 +127,9 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen Window win = calculate_max_window(*output->info()); if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -134,11 +138,11 @@ void CLMultiplyScaleFactorKernel::configure(const ICLTensor *input, const ICLTen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + multi_access_x, "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); + CLKernelLibraryEx::get().create_kernel("multiply_scale_factor", build_opts.options())); } Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, @@ -147,7 +151,7 @@ Status CLMultiplyScaleFactorKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp 
b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index 643c8b110..cd5e571e9 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; @@ -80,9 +83,9 @@ void CLNegKernel::configure(const ICLTensor *input, ICLTensor *output) std::set<std::string> build_opts; build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); build_opts.emplace( - ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("neg_tensor", build_opts)); // Configure window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp new file mode 100644 index 000000000..4c4cbe710 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + +#include "support/StringSupport.h" +#include <string> +namespace arm_compute +{ +namespace +{ +inline Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *output, int depth, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, on_value, output); + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(depth <= 0); + ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= output->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8, + DataType::U16, DataType::S16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( + indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); + } + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *indices, + const ITensorInfo *on_value, + ITensorInfo *output, int depth, int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output, indices); + const uint32_t actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions())); + // Output auto initialization if not yet initialized + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_onehot_shape_ex( + indices->tensor_shape(), static_cast<uint32_t>(depth), actual_axis); + auto_init_if_empty((*output), output_shape, 1, on_value->data_type()); + // Create window + Window win = calculate_max_window(*output, Steps()); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace +CLOneHotKernel::CLOneHotKernel() + : _indices(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), + _is_off_value_memset(false) +{ +} +void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value, + const ICLTensor *off_value, ICLTensor *output, int depth, int axis) +{ + _is_off_value_memset = false; + ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, off_value, output); + ARM_COMPUTE_ERROR_ON_NULLPTR(off_value->info()); + ARM_COMPUTE_ERROR_ON(off_value->info()->tensor_shape().total_size() != 1); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value); + _off_value = off_value; + configure_common(indices, on_value, output, depth, axis); +} +void CLOneHotKernel::configure(const ICLTensor *indices, const ICLTensor *on_value, + ICLTensor *output, int depth, int axis) +{ + _is_off_value_memset = true; + ARM_COMPUTE_ERROR_ON_NULLPTR(indices, on_value, output); + configure_common(indices, on_value, output, depth, axis); +} +void CLOneHotKernel::configure_common(const ICLTensor *indices, const ICLTensor 
*on_value, + ICLTensor *output, int depth, int axis) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(indices->info(), on_value->info(), output->info(), depth, axis)); + // Configure kernel window + auto win_config = + validate_and_configure_window(indices->info(), on_value->info(), output->info(), depth, axis); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + if (_is_off_value_memset) + { + // Replace window with one calculated from the indices info + win_config.second = calculate_max_window(*indices->info(), Steps()); + } + _indices = indices; + _on_value = on_value; + _output = output; + const auto actual_axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions())); + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size( + data_size_from_type(on_value->info()->data_type()))); + build_opts.add_option("-DAXIS=" + support::cpp11::to_string(actual_axis)); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth)); + build_opts.add_option("-DOUTPUT_DIM_Z=" + + support::cpp11::to_string(output->info()->dimension(2))); + // Create kernel + const std::string kernel_name = _is_off_value_memset ? "one_hot_only_on_value" : "one_hot"; + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); + ICLKernel::configure_internal(win_config.second); +} +Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(off_value); + ARM_COMPUTE_RETURN_ERROR_ON(off_value->tensor_shape().total_size() != 1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), + on_value->clone().get(), + output->clone().get(), depth, axis) + .first); + return Status{}; +} +Status CLOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *output, int depth, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(indices, on_value, output, depth, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(indices->clone().get(), + on_value->clone().get(), + output->clone().get(), depth, axis) + .first); + return Status{}; +} +void CLOneHotKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + unsigned int idx = 0; + add_3D_tensor_argument(idx, _indices, window_collapsed); + add_1D_tensor_argument(idx, _on_value, window_collapsed); + if (!_is_off_value_memset) + { + add_1D_tensor_argument(idx, _off_value, window_collapsed); + } + add_4D_tensor_argument(idx, _output, window_collapsed); + enqueue(queue, *this, window_collapsed, lws_hint()); +} + +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp new file mode 100644 index 000000000..b6efeac35 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd.
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" + +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(constant_value); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions()); + if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) + { + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3); + + const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); + for (size_t i = 0; i < padding.size(); ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect)); + ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect)); + } + } + + if (output->total_size() > 0) + { + TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); + } + + return Status{}; +} + +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode 
mode, + unsigned int &num_elems_processed_per_iteration) +{ + ARM_COMPUTE_UNUSED(constant_value, mode); + + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); + auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape)); + + num_elems_processed_per_iteration = + std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type()))); + if (input->dimension(0) < num_elems_processed_per_iteration) + { + num_elems_processed_per_iteration = + 1 << static_cast<unsigned int>(std::log2(input->dimension(0))); + } + + // Configure kernel window + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + const int input_start_x = + mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0; + const int input_start_y = + (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0; + + AccessWindowRectangle input_access(input, input_start_x, input_start_y, + num_elems_processed_per_iteration, 1); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + const bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLPadLayerKernelEx::CLPadLayerKernelEx() + : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false) +{ +} + +void CLPadLayerKernelEx::configure(const ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, + mode); +} + +void CLPadLayerKernelEx::configure(const CLCompileContext &compile_context, const ICLTensor *input, + ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_UNUSED(compile_context); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), padding, constant_value, mode)); + + _input = input; + _output = output; + _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3); + + // Configure window + unsigned int vec_size; + auto win_config = validate_and_configure_window(input->info(), output->info(), padding, + constant_value, mode, vec_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set build options + std::string kernel_name = "pad_layer_"; + + const DataType &data_type = input->info()->data_type(); + const unsigned int input_width = input->info()->dimension(0); + const unsigned int input_height = input->info()->dimension(1); + const unsigned int input_depth = input->info()->dimension(2); + const unsigned int pad_x_before = padding.at(0).first; + const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; + const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0; + const unsigned int pad_right_start = input_width + pad_x_before; + + _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0; + _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? 
-padding.at(1).first : 0; + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before)); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); + if (padding.size() > 1) + { + build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); + + if (padding.size() > 2) + { + build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before)); + build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth)); + } + } + + switch (mode) + { + case PaddingMode::CONSTANT: + { + kernel_name += "constant"; + + build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type)); + build_opts.add_option_if(pad_x_before >= vec_size, + "-DNUM_THREADS_TO_SKIP_X=" + + support::cpp11::to_string(pad_x_before / vec_size)); + + if (_4d_enabled) + { + build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first)); + build_opts.add_option("-DSRC_BATCH=" + + support::cpp11::to_string(input->info()->dimension(3))); + } + + break; + } + case PaddingMode::SYMMETRIC: + case PaddingMode::REFLECT: + { + kernel_name += "symmetric_reflect"; + + const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); + + const unsigned int pad_x_before_remainder = pad_x_before % vec_size; + const unsigned int pad_x_after_remainder = pad_right_start % vec_size; + const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect; + const unsigned int output_last_x = + ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); + + build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect)); + build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + + support::cpp11::to_string(pad_x_before_remainder)); + build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + + support::cpp11::to_string(pad_x_after_remainder)); + build_opts.add_option( + "-DPAD_X_BEFORE_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); + build_opts.add_option( + "-DPAD_X_AFTER_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); + build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x)); + build_opts.add_option_if(after_pad_fact_x < output_last_x, + "-DAFTER_PAD_REM=" + + support::cpp11::to_string(after_pad_fact_x % vec_size)); + + break; + } + default: + ARM_COMPUTE_ERROR("Padding mode not supported."); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); +} + +Status CLPadLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + unsigned int vec_size; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + output->clone().get(), padding, + constant_value, mode, vec_size) + .first); + + return Status{}; +} + +void CLPadLayerKernelEx::run(const Window &window, cl::CommandQueue 
&queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window win_in = window; + win_in.adjust(Window::DimX, _input_start_x, true); + win_in.adjust(Window::DimY, _input_start_y, true); + + Window slice_out = window.first_slice_window_3D(); + Window slice_in = win_in.first_slice_window_3D(); + unsigned int batch = 0; + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + if (_4d_enabled) + { + add_argument<unsigned int>(idx, batch++); + } + + enqueue(queue, *this, slice_out, lws_hint()); + } while (window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in)); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index 1a7a18cfa..9aa815f55 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -40,15 +40,19 @@ #include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" namespace arm_compute @@ -87,9 +91,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen if (multi_access_x) { - win.set(Window::DimX, - Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), - vec_size_x)); + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } Coordinates coord; @@ -101,7 +105,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } // namespace CLQuantizationSymmetricKernel::CLQuantizationSymmetricKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr) { } @@ -110,7 +114,7 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT { ARM_COMPUTE_ERROR_ON_NULLPTR(input, scale_factor, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -132,11 +136,11 @@ void CLQuantizationSymmetricKernel::configure(const ICLTensor *input, const ICLT build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if( - multi_access_x, "-DLAST_ACCESSED_X=" + - support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); + multi_access_x, + "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); + 
CLKernelLibraryEx::get().create_kernel("quantization_symm8", build_opts.options())); } Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, @@ -145,7 +149,7 @@ Status CLQuantizationSymmetricKernel::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, scale_factor, output)); ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), output->clone().get()).first); + validate_and_configure_window(input->clone().get(), output->clone().get()).first); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index 06c2579f2..70374ba61 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; @@ -63,7 +66,7 @@ const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis, - ReduceOperation op) + ReductionOperation op) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -74,7 +77,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32, DataType::S32); - if (op == ReduceOperation::SUM) + if (op == ReductionOperation::SUM) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8, "Not support QASYMM8, yet"); @@ -98,7 +101,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c CLReduceOperationKernel::CLReduceOperationKernel() : _input(nullptr), _output(nullptr), _axis() {} void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *output, - const uint32_t axis, ReduceOperation op) + const uint32_t axis, ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -114,22 +117,22 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu // Construct kernel name std::string kernel_name; int op_code = 0; - if (op == ReduceOperation::MAX) + if (op == ReductionOperation::MAX) { kernel_name = "reduce_min_max"; op_code = 1; } - else if (op == ReduceOperation::MIN) + else if (op == ReductionOperation::MIN) { kernel_name = "reduce_min_max"; op_code = 2; } - else if (op == ReduceOperation::SUM) + else if (op == ReductionOperation::SUM) { kernel_name = "reduce_sum_mean"; op_code = 3; } - else if (op == ReduceOperation::MEAN) + else if (op == ReductionOperation::MEAN_SUM) { kernel_name = "reduce_sum_mean"; op_code = 4; @@ -145,7 +148,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu // Create kernel _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); // Configure kernel window Window win = calculate_max_window(*output_info, Steps()); @@ -158,7 +161,7 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu } Status CLReduceOperationKernel::validate(const ITensorInfo *input, const 
ITensorInfo *output, - const uint32_t axis, ReduceOperation op) + const uint32_t axis, ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index 8d8853c81..c9d6dc31c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -40,7 +40,7 @@ #include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" -#include "arm_compute/core/AccessWindowStatic.h" +#include "src/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -48,6 +48,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include <climits> @@ -94,8 +98,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; return std::make_tuple(err, win); } } // namespace @@ -115,7 +119,7 @@ void CLScaleFactorSymm8Kernel::configure(const ICLTensor *input, ICLTensor *outp // Create kernel _kernel = static_cast<cl::Kernel>( - CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); + CLKernelLibraryEx::get().create_kernel("scale_factor_symm8", build_opts)); auto win_config = validate_and_configure_window(input->info(), output->info()); @@ -128,7 +132,7 @@ Status CLScaleFactorSymm8Kernel::validate(const ITensorInfo *input, const ITenso { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); ARM_COMPUTE_RETURN_ON_ERROR( - std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp deleted file mode 100644 index 480532388..000000000 --- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPOneHotKernelEx.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" - -namespace arm_compute -{ -CPPOneHotKernelEx::CPPOneHotKernelEx() - : _indices(nullptr), _depth(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr), - _axis(-1) -{ -} - -void CPPOneHotKernelEx::configure(const ITensor *indices, const ITensor *depth, - const ITensor *on_value, const ITensor *off_value, - ITensor *output, const int axis) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(indices, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(indices, depth, on_value, off_value, axis)); - - _indices = indices; - _depth = depth; - _on_value = on_value; - _off_value = off_value; - _output = output; - _axis = axis; - - ICPPKernel::configure(Window()); // Default 1 iteration window -} - -Status CPPOneHotKernelEx::validate(const ITensor *indices, const ITensor *depth, - const ITensor *on_value, const ITensor *off_value, - const int axis) -{ - ARM_COMPUTE_UNUSED(on_value, off_value); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(indices, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(depth, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->info()->num_dimensions() != 1, - "Only 1D indices are supported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != -1, "Only axis = -1 is supported."); - return Status{}; -} - -bool CPPOneHotKernelEx::is_parallelisable() const { return false; } - -void CPPOneHotKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window); - - const auto num_indices = _indices->info()->dimension(0); - const auto depth = *reinterpret_cast<int32_t *>(_depth->ptr_to_element(Coordinates{0})); - const auto dtype = _output->info()->data_type(); - switch (dtype) - { - case DataType::F32: - { - const auto on_value = *reinterpret_cast<float *>(_on_value->ptr_to_element(Coordinates{0})); - const auto off_value = *reinterpret_cast<float *>(_off_value->ptr_to_element(Coordinates{0})); - for (size_t i = 0; i < num_indices; ++i) - { - const 
auto index = *reinterpret_cast<int32_t *>(_indices->ptr_to_element(Coordinates{i})); - for (int d = 0; d < depth; ++d) - *reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(d, i))) = - (d == index) ? on_value : off_value; - } - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp deleted file mode 100644 index 254c33ea9..000000000 --- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2016-2018 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" - -#include <algorithm> -#include "arm_compute/core/Types.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" - -namespace -{ -void store_quantized_int32(uint8_t *output_ptr, const int32x4x4_t &out) -{ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); - vst1q_u8(output_ptr, vcombine_u8(pa, pb)); -} - -using namespace arm_compute; -template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> -void elementwise_op_templ( - const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, - OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, - OutputScalarType *)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); - - if (is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = - reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = - *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, - non_broadcast_input_ptr, broadcast_value, - output_ptr, !is_broadcast_input_2); - for (; x < window_end_x; ++x) - { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = - (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, - !is_broadcast_input_2 ? 
a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, - [&](const Coordinates &) { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = - reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = - reinterpret_cast<const InputScalarType *>(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, - input1_ptr, input2_ptr, output_ptr); - for (; x < window_end_x; ++x) - { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); - } -} - -} // namespace - -namespace arm_compute -{ - -float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, - const float32x4_t &scale) -{ - qasymm8x16_t x = vld1q_u8(input1_ptr); - const float32x4x4_t out = {{ - vmulq_f32( - vcvtq_f32_s32(vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), - scale), - vmulq_f32( - vcvtq_f32_s32(vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), - scale), - vmulq_f32( - vcvtq_f32_s32(vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), - scale), - vmulq_f32( - vcvtq_f32_s32(vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), - scale), - }}; - return out; -} - -void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, - const float32x4_t &invscale) -{ - int32x4x4_t out = {{ - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - }}; - store_quantized_int32(output_ptr, out); -} - -float32x4x4_t dup_quantized(uint8_t broadcast_value, int offset, float scale) -{ - const qasymm8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); - const int32x4_t voffset = vdupq_n_s32(offset); - const float32x4_t vscale = vdupq_n_f32(scale); - - const float32x4x4_t broadcast_vector = {{ - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( - vmovl_u8(vget_low_u8(broadcast_value_vec))))), - voffset)), - vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( - vmovl_u8(vget_low_u8(broadcast_value_vec))))), - voffset)), - vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16( - vmovl_u8(vget_high_u8(broadcast_value_vec))))), - voffset)), - vscale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16( - vmovl_u8(vget_high_u8(broadcast_value_vec))))), - voffset)), - vscale), - }}; - return broadcast_vector; -} - -void elementwise_op_quantized( - const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, QuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, - float32x4_t, float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, 
const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, - int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16; - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0); - - UniformQuantizationInfo qinfo = out->info()->quantization_info().uniform(); - const float output_scale = qinfo.scale; - const int output_offset = qinfo.offset; - - // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from - // zero) - const float32x4_t voffseto = vdupq_n_f32(output_offset + 0.5f); - const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale); - - if (is_broadcast_across_x) - { - // Select the broadcast input on the X axis - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - const UniformQuantizationInfo broadcast_qinfo = - broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = - non_broadcast_tensor->info()->quantization_info().uniform(); - - const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); - const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop( - win, - [&](const Coordinates &) { - const auto non_broadcast_input_ptr = - reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = - dup_quantized(broadcast_value, broadcast_qinfo.offset, broadcast_qinfo.scale); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, - non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, - invvscaleo, !is_broadcast_input_2); - for (; x < window_end_x; ++x) - { - const float afs = - dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = - (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, - out->info()->quantization_info()); - } - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Input1 quantization info - UniformQuantizationInfo qinfo = in1->info()->quantization_info().uniform(); - const int32x4_t voffset1 = vdupq_n_s32(qinfo.offset); - const float32x4_t vscale1 = vdupq_n_f32(qinfo.scale); - - // Input2 quantization info - qinfo = in2->info()->quantization_info().uniform(); - const int32x4_t voffset2 = vdupq_n_s32(qinfo.offset); - const float32x4_t vscale2 = vdupq_n_f32(qinfo.scale); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const QuantizationInfo input1_qinfo = in1->info()->quantization_info(); - const QuantizationInfo input2_qinfo = in2->info()->quantization_info(); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, - [&](const Coordinates &) { - const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, - input1_ptr, input2_ptr, output_ptr, voffset1, - voffset2, vscale1, vscale2, voffseto, invvscaleo); - for (; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = - (*scalar_func)(afs, bfs, out->info()->quantization_info()); - } - }, - input1, input2, output); - } -} - -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - float (*scalar_func)(const float &, const float &), - int (*broadcast_func)(int, int, int, const float *, const float &, float *, - const bool), - int (*neon_func)(int, int, int, const float *, const float *, float *)) -{ - elementwise_op_templ<float, float, float32x4_t>(in1, in2, out, window, scalar_func, - broadcast_func, neon_func); -} - -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const uint8_t &, const uint8_t &), - int (*broadcast_func)(int, int, int, const uint8_t *, const uint8_t &, - uint8_t *, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *)) -{ - elementwise_op_templ<uint8_t, uint8_t, uint8x16_t>(in1, in2, out, window, scalar_func, - broadcast_func, neon_func); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp deleted file mode 100644 index 648705ba9..000000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEActivationLayerKernelEx.cpp +++ /dev/null @@ -1,730 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/NEFixedPoint.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/NESymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" - -#include <arm_neon.h> -#include <array> -#include <cmath> -#include <map> -#include <set> - -using namespace arm_compute; -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfo &activation_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); - - static std::set<ActivationLayerInfo::ActivationFunction> qasymm8_supported_activations = { - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH}; - static std::set<ActivationLayerInfo::ActivationFunction> qsymm16_supported_activations = { - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH}; - const DataType data_type = input->data_type(); - const QuantizationInfo &oq_info = - (output != nullptr) ? 
output->quantization_info() : input->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - is_data_type_quantized_asymmetric(data_type) && - (qasymm8_supported_activations.count(f_act) == 0), - "For QASYMM8 only tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && - (qsymm16_supported_activations.count(f_act) == 0), - "For QSYMM16 only tanh and logistic are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) && - (f_act == ActivationLayerInfo::ActivationFunction::TANH) && - (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(data_type) && - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && - (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && - (f_act == ActivationLayerInfo::ActivationFunction::TANH) && - (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && - (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - // Checks performed when output is configured - if ((output != nullptr) && (output->total_size() != 0)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - } - - return Status{}; -} - -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - // Configure kernel window - Window win = calculate_max_window(*input, Steps()); - - if (output != nullptr) - { - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output, *input->clone()); - - // NEActivationLayerKernelEx doesn't need padding so update_window_and_padding() can be skipped - Coordinates coord; - coord.set_num_dimensions(output->num_dimensions()); - output->set_valid_region(ValidRegion(coord, output->tensor_shape())); - } - - return std::make_pair(Status{}, win); -} - -inline uint32x4_t vreinterpret_unsigend_int(const float32x4_t &vec) -{ - return vreinterpretq_u32_f32(vec); -} - -inline float32x4_t vreinterpret_floating_point(const uint32x4_t &vec) -{ - return vreinterpretq_f32_u32(vec); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -inline uint16x8_t vreinterpret_unsigend_int(const float16x8_t &vec) -{ - return vreinterpretq_u16_f16(vec); -} -inline float16x8_t vreinterpret_floating_point(const uint16x8_t &vec) -{ - return vreinterpretq_f16_u16(vec); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ -} // namespace - -NEActivationLayerKernelEx::NEActivationLayerKernelEx() - : _input(nullptr), _output(nullptr), _func(nullptr), _act_info() -{ -} - -void NEActivationLayerKernelEx::configure(ITensor *input, ITensor *output, - ActivationLayerInfo activation_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input); - - _input = input; - _act_info = activation_info; - _output = input; - - // Out-of-place calculation - if (output != nullptr) - { - _output = output; - } - - // Disabled activation, thus no operation needed - if (!activation_info.enabled()) - { - _func = nullptr; - } - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( - input->info(), (output != nullptr) ? 
output->info() : nullptr, activation_info)); - - // Activation functions : FP32 - static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 = { - {ActivationFunction::ABS, - &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float>}, - {ActivationFunction::LINEAR, - &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float>}, - {ActivationFunction::LOGISTIC, - &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float>}, - {ActivationFunction::RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float>}, - {ActivationFunction::BOUNDED_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float>}, - {ActivationFunction::LU_BOUNDED_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float>}, - {ActivationFunction::LEAKY_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float>}, - {ActivationFunction::SOFT_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float>}, - {ActivationFunction::ELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float>}, - {ActivationFunction::SQRT, - &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float>}, - {ActivationFunction::SQUARE, - &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float>}, - {ActivationFunction::TANH, - &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float>}, - {ActivationFunction::IDENTITY, - &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float>}, - }; - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - // Activation functions : FP16 - static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f16 = { - {ActivationFunction::ABS, - &NEActivationLayerKernelEx::activation<ActivationFunction::ABS, float16_t>}, - {ActivationFunction::LINEAR, - &NEActivationLayerKernelEx::activation<ActivationFunction::LINEAR, float16_t>}, - {ActivationFunction::LOGISTIC, - &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, float16_t>}, - {ActivationFunction::RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, float16_t>}, - {ActivationFunction::BOUNDED_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, float16_t>}, - {ActivationFunction::LU_BOUNDED_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, float16_t>}, - {ActivationFunction::LEAKY_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::LEAKY_RELU, float16_t>}, - {ActivationFunction::SOFT_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::SOFT_RELU, float16_t>}, - {ActivationFunction::ELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::ELU, float16_t>}, - {ActivationFunction::SQRT, - &NEActivationLayerKernelEx::activation<ActivationFunction::SQRT, float16_t>}, - {ActivationFunction::SQUARE, - &NEActivationLayerKernelEx::activation<ActivationFunction::SQUARE, float16_t>}, - {ActivationFunction::TANH, - &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, float16_t>}, - {ActivationFunction::IDENTITY, - &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, float16_t>}, - }; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ - - // Activation functions : QASYMM8 - static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 = { - {ActivationFunction::LOGISTIC, - 
&NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qasymm8_t>}, - {ActivationFunction::BOUNDED_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::BOUNDED_RELU, qasymm8_t>}, - {ActivationFunction::LU_BOUNDED_RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t>}, - {ActivationFunction::RELU, - &NEActivationLayerKernelEx::activation<ActivationFunction::RELU, qasymm8_t>}, - {ActivationFunction::TANH, - &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qasymm8_t>}, - {ActivationFunction::IDENTITY, - &NEActivationLayerKernelEx::activation<ActivationFunction::IDENTITY, qasymm8_t>}, - }; - - // Activation functions : QSYMM16 - static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qsymm16 = { - {ActivationFunction::LOGISTIC, - &NEActivationLayerKernelEx::activation<ActivationFunction::LOGISTIC, qsymm16_t>}, - {ActivationFunction::TANH, - &NEActivationLayerKernelEx::activation<ActivationFunction::TANH, qsymm16_t>}, - }; - - switch (input->info()->data_type()) - { - case DataType::QASYMM8: - _func = act_map_qasymm8[activation_info.activation()]; - break; - case DataType::QSYMM16: - _func = act_map_qsymm16[activation_info.activation()]; - break; - case DataType::F32: - _func = act_map_f32[activation_info.activation()]; - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func = act_map_f16[activation_info.activation()]; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Unsupported data type."); - } - - // Configure kernel window - auto win_config = - validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICPPKernel::configure(win_config.second); -} - -template <ActivationLayerInfo::ActivationFunction F, typename T> -typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type -NEActivationLayerKernelEx::activation(const Window &window) -{ - /** NEON vector tag type. 
*/ - using ExactTagType = - typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const ActivationFunction act = F; - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(_input, win_collapsed); - Iterator output(_output, win_collapsed); - - const auto infinity = wrapper::vdup_n(std::numeric_limits<T>::infinity(), ExactTagType{}); - const auto epsilon = wrapper::vdup_n(static_cast<T>(1e-24), ExactTagType{}); - const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); - const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - const auto va = wrapper::vdup_n(static_cast<T>(_act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast<T>(_act_info.b()), ExactTagType{}); - const auto a = static_cast<T>(_act_info.a()); - const auto b = static_cast<T>(_act_info.b()); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - - wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; - - // Compute S elements per iteration - int x = window_start_x; - - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch (act) - { - case ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationFunction::LOGISTIC: - // exp(-vin) - tmp = wrapper::vexpq(wrapper::vneg(vin)); - - // NaN -> INF - tmp = vreinterpret_floating_point(wrapper::vorr( - wrapper::vand(wrapper::vnot(wrapper::vceq(tmp, tmp)), - vreinterpret_unsigend_int(infinity)), - wrapper::vand(wrapper::vceq(tmp, tmp), vreinterpret_unsigend_int(tmp)))); - - // 1 / 1 + tmp - tmp = wrapper::vinv(wrapper::vadd(const_1, tmp)); - break; - case ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationFunction::SOFT_RELU: - tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))); - break; - case ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, - wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationFunction::SQRT: - tmp = wrapper::vinv(wrapper::vinvsqrt(vin + epsilon)); - break; - case ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationFunction::IDENTITY: - tmp = vin; - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - const T in = *(reinterpret_cast<const T *>(input_ptr + x)); - T tmp; - switch (act) - { - case ActivationFunction::ABS: - tmp = 
std::abs(in); - break; - case ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationFunction::LOGISTIC: - tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in)); - break; - case ActivationFunction::RELU: - tmp = std::max<T>(static_cast<T>(0), in); - break; - case ActivationFunction::BOUNDED_RELU: - tmp = std::min<T>(a, std::max(static_cast<T>(0), in)); - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min<T>(a, std::max<T>(b, in)); - break; - case ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationFunction::SOFT_RELU: - tmp = std::log(static_cast<T>(1) + std::exp(in)); - break; - case ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationFunction::IDENTITY: - tmp = in; - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} - -template <ActivationLayerInfo::ActivationFunction F, typename T> -typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type -NEActivationLayerKernelEx::activation(const Window &window) -{ - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const ActivationFunction act = F; - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(_input, win_collapsed); - Iterator output(_output, win_collapsed); - - const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = _output->info()->quantization_info().uniform(); - const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(_act_info.a(), qi_in)); - const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(_act_info.b(), qi_in)); - const qasymm8_t a = quantize_qasymm8(_act_info.a(), qi_in); - const qasymm8_t b = quantize_qasymm8(_act_info.b(), qi_in); - const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); - const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(_act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b()); - const float a_f32 = _act_info.a(); - const float b_f32 = _act_info.b(); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - - wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; - - // Compute S elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if (act == ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if (act == ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, 
vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if (act == ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if (act == ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = {{ - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg( - vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg( - vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg( - vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg( - vin_deq.val[3])))), - }}; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if (act == ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = {{ - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - }}; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - T in = *(reinterpret_cast<const T *>(input_ptr + x)); - T tmp; - if (act == ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255)); - } - else if (act == ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255)); - } - else if (act == ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = std::max<int32_t>(0, std::min<int32_t>(tmp * s + o, 255)); - } - else if (act == ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if (act == ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} - -template <ActivationLayerInfo::ActivationFunction F, typename T> -typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type -NEActivationLayerKernelEx::activation(const Window &window) -{ - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast<int>(window.x().start()); - const auto window_end_x = static_cast<int>(window.x().end()); - const ActivationFunction act = F; - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(_input, win_collapsed); - Iterator output(_output, win_collapsed); - - const UniformQuantizationInfo qi_in = _input->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = 
_output->info()->quantization_info().uniform(); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(_act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(_act_info.b()); - const float a_f32 = _act_info.a(); - const float b_f32 = _act_info.b(); - - execute_window_loop( - win_collapsed, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - - wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; - ARM_COMPUTE_UNUSED(tmp); - - // Compute S elements per iteration - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if (act == ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = {{ - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg( - vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg( - vin_deq.val[1])))), - }}; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if (act == ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = {{ - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - }}; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for (; x < window_end_x; ++x) - { - T in = *(reinterpret_cast<const T *>(input_ptr + x)); - T tmp; - if (act == ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if (act == ActivationFunction::TANH) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} - -Status NEActivationLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(input->clone().get(), - (output != nullptr) ? 
output->clone().get() : nullptr) - .first); - - return Status{}; -} - -void NEActivationLayerKernelEx::run(const Window &window, const ThreadInfo &info) -{ - // Early exit on disabled activation - if (!_act_info.enabled()) - { - return; - } - - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - (this->*_func)(window); -} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp deleted file mode 100644 index 32d7d6237..000000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" - -#include <algorithm> -#include <arm_neon.h> -#include <map> -#include <string> - -namespace arm_compute -{ -class Coordinates; -} // namespace arm_compute - -namespace arm_compute -{ - -template <BinaryLogicalOperation op, typename ScalarType> -inline ScalarType elementwise_logic_op_scalar(const ScalarType &a, const ScalarType &b) -{ - auto res = ScalarType(0); - - switch (op) - { - case BinaryLogicalOperation::AND: - res = a & b; - break; - case BinaryLogicalOperation::OR: - res = a | b; - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template <BinaryLogicalOperation op, typename VectorType> -inline VectorType elementwise_logic_op(const VectorType &a, const VectorType &b) -{ - VectorType res = {0, 0, 0, 0}; - - switch (op) - { - case BinaryLogicalOperation::AND: - res = wrapper::vand(a, b); - break; - case BinaryLogicalOperation::OR: - res = wrapper::vorr(a, b); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } - return res; -} - -template <BinaryLogicalOperation op> -inline uint8x16x4_t elementwise_logic_op(const uint8x16x4_t &a, const uint8x16x4_t &b) -{ - uint8x16x4_t out = {{ - elementwise_logic_op<op>(a.val[0], b.val[0]), elementwise_logic_op<op>(a.val[1], b.val[1]), - elementwise_logic_op<op>(a.val[2], b.val[2]), elementwise_logic_op<op>(a.val[3], b.val[3]), - }}; - return out; -} - -template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> -inline VectorType elementwise_logic_op_broadcast(const VectorType &a, - const ScalarType &broadcast_value, - const bool reorder) -{ - VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_logic_op<op>(reorder ? broadcast_vector : a, reorder ? 
a : broadcast_vector); -} - -template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> -inline int elementwise_logic_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, - ScalarType *output_ptr) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, elementwise_logic_op<op>(a, b)); - } - return x; -} - -template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> -inline int elementwise_logic_op_broadcast_loop(int window_start_x, int window_end_x, - int window_step_x, - const ScalarType *non_broadcast_input_ptr, - const ScalarType &broadcast_value, - ScalarType *output_ptr, const bool reorder) -{ - int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, - elementwise_logic_op_broadcast<op>(a, broadcast_value, reorder)); - } - return x; -} - -template <BinaryLogicalOperation op, typename ScalarType, typename VectorType> -void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, - const Window &window) -{ - elementwise_op(in1, in2, out, window, &elementwise_logic_op_scalar<op, ScalarType>, - &elementwise_logic_op_broadcast_loop<op, ScalarType, VectorType>, - &elementwise_logic_op_loop<op, ScalarType, VectorType>); -} - -std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( - const ITensor *input1, const ITensor *input2, ITensor *output, - std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) -{ - std::string function_to_call("op_"); - function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; - function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; - function_to_call += string_from_data_type(output->info()->data_type()); - - auto it = map_function.find(function_to_call); - - if (it != map_function.end()) - { - auto func = it->second; - return [func](const ITensor *input1, const ITensor *input2, ITensor *output, - const Window &window) { func(input1, input2, output, window); }; - } - return nullptr; -} - -template <BinaryLogicalOperation op> -std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> -configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) -{ - static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { - {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, - {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; - - return configure_func(input1, input2, output, map_function); -} - -void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const ITensor *input1, - const ITensor *input2, ITensor *output) -{ - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); - configure_common(input1, input2, output); - switch (op) - { - case BinaryLogicalOperation::AND: - _function = configure_logic_func<BinaryLogicalOperation::AND>(input1, input2, output); - break; - case BinaryLogicalOperation::OR: - _function = configure_logic_func<BinaryLogicalOperation::OR>(input1, input2, output); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); 
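The removed kernel lowers BinaryLogicalOperation::AND / ::OR to wrapper::vand / wrapper::vorr over 16-lane uint8 vectors, with a scalar loop for the left-over elements. As a rough standalone sketch of that pattern, using raw NEON intrinsics and hypothetical names rather than the wrapper layer and the ACL window machinery:

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

enum class LogicalOp { AND, OR };

// Element-wise logical AND/OR over uint8 buffers: 16 lanes per NEON
// iteration, then a scalar tail, mirroring the vector/left-over split above.
void binary_logical_u8(const uint8_t *a, const uint8_t *b, uint8_t *out, size_t n, LogicalOp op)
{
  size_t i = 0;
  for (; i + 16 <= n; i += 16)
  {
    const uint8x16_t va = vld1q_u8(a + i);
    const uint8x16_t vb = vld1q_u8(b + i);
    vst1q_u8(out + i, op == LogicalOp::AND ? vandq_u8(va, vb) : vorrq_u8(va, vb));
  }
  for (; i < n; ++i)
  {
    out[i] = static_cast<uint8_t>(op == LogicalOp::AND ? (a[i] & b[i]) : (a[i] | b[i]));
  }
}

For bool-like U8/QASYMM8 data any non-zero byte behaves as true, so the bitwise vand/vorr forms are sufficient; no comparison or normalisation step is needed before storing the result.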
- } -} - -Status NEBinaryLogicalOperationKernel::validate_arguments(const ITensorInfo &input1, - const ITensorInfo &input2, - const ITensorInfo &output) -{ - // Validate in case of configured output - if (output.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, - DataType::QASYMM8); - } - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); - - const TensorShape out_shape = - TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, - "Inputs are not broadcast compatible"); - - // Validate in case of configured output - if (output.total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), - "Wrong shape for output"); - } - - return Status{}; -} - -Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, - const ITensorInfo *input1, - const ITensorInfo *input2, - const ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(op); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); - return Status{}; -} - -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp new file mode 100644 index 000000000..87e716b4f --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" + +#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "src/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "support/SaturateCast.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + +#include "src/core/NEON/INEKernel.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); + ARM_COMPUTE_RETURN_ERROR_ON(input == output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, + DataType::S16, DataType::U16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + } + + return Status{}; +} +} // namespace + +NECastBoolKernel::NECastBoolKernel() : _input(nullptr), _output(nullptr) {} + +void NECastBoolKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype + // must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + _input = input; + _output = output; + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + ICPPKernel::configure(win); +} + +Status NECastBoolKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + return Status{}; +} + +void NECastBoolKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output); + ARM_COMPUTE_ERROR_ON(_input == _output); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win); + Iterator output(_output, win); + + const uint8_t true_val = 1; + const uint8x8_t mask_bool = vdup_n_u8(true_val); + + switch (_output->info()->data_type()) + { + case DataType::S8: + { + /* Conversion U8 -> S8 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_s8(output_ptr + x, + vreinterpretq_s8_u8(vandq_u8(texels_u8, vdupq_n_u8(true_val)))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::S16: + { + /* Up-conversion U8 -> S16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s16(output_ptr + x, texels.val[0]); + vst1q_s16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::S32: + { + /* Up-conversion U8 -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + + vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::F32: + { + /* Up-conversion U8 -> F32 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val); + *(output_ptr + x) = static_cast<float>(in); + } + }, + input, output); + break; + } +#ifdef 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + /* Up-conversion U8 -> F16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const int16x8x2_t texels = { + {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))), + vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}}; + vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::U8: + { + /* Conversion U8 -> S8 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + case DataType::U16: + { + /* Up-conversion U8 -> U16 */ + execute_window_loop( + win, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x); + + const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)), + vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}}; + + vst1q_u16(output_ptr + x, texels.val[0]); + vst1q_u16(output_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val); + } + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp index 091d38c56..3ad9ee945 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -47,10 +47,13 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + using namespace arm_compute; NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() - : _input(nullptr), _lookups(nullptr), _output(nullptr) + : _input(nullptr), _lookups(nullptr), _output(nullptr) { } @@ -79,8 +82,8 @@ Status NEEmbeddingLookupKernel::validate(const arm_compute::ITensorInfo *input, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, lookups); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, 
DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON(input->num_dimensions() < 2 && input->num_dimensions() > 4); @@ -119,16 +122,17 @@ void NEEmbeddingLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const int32_t lookup = *reinterpret_cast<int32_t *>( - _lookups->ptr_to_element(Coordinates{id[lookup_dim]})); - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const int32_t lookup = + *reinterpret_cast<int32_t *>(_lookups->ptr_to_element(Coordinates{id[lookup_dim]})); + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 000000000..375fa28e5 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> +#include <cstddef> +#include <cstdint> +#include <mutex> + +using namespace arm_compute; + +namespace +{ +inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0)); + + return Status{}; +} + +inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, + ITensorInfo *biases) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + bool window_changed = update_window_and_padding( + win, AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration), + AccessWindowStatic(biases, 0, 0, + ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), + biases->tensor_shape().y())); + + AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration); + + // Set the valid region for the accum tensor + Coordinates coord; + coord.set_num_dimensions(accum->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape())); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); + + _biases = biases; + _accum = accum; + + // Configure kernel window + auto win_config = validate_and_configure_window(accum->info(), biases->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, + const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(accum->clone().get(), biases->clone().get()).first); + + return Status{}; +} + +std::mutex m; +void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info) +{ + std::lock_guard<std::mutex> lock_guard(m); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window win_biases; + win_biases.set(Window::DimX, + Window::Dimension(window.x().start(), window.x().end(), window.x().step())); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in0_out(_accum, window); + Iterator in1(_biases, win_biases); + + switch (_accum->info()->data_type()) + { + case DataType::F32: + { + execute_window_loop( + window, + [&](const Coordinates &) { + const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr())); + const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr())); + const float32x4x4_t res = { + {vaddq_f32(accum.val[0], biases.val[0]), vaddq_f32(accum.val[1], biases.val[1]), + vaddq_f32(accum.val[2], biases.val[2]), vaddq_f32(accum.val[3], biases.val[3])}}; + + vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res); + }, + in0_out, in1); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + execute_window_loop( + window, + [&](const Coordinates &) { + const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr())); + const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr())); + const float16x8x2_t res = { + {vaddq_f16(accum.val[0], biases.val[0]), vaddq_f16(accum.val[1], biases.val[1])}}; + + vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res); + }, + in0_out, in1); + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp index 4c0a5e799..d4144e6b9 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -40,7 +40,7 @@ #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -50,6 +50,9 @@ #include 
"arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { namespace @@ -70,7 +73,10 @@ template <typename U> void validate_indices(const ITensor *indices) } // namespace -NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {} +NEGatherKernelEx::NEGatherKernelEx() + : _input{}, _indices{}, _axis{}, _indices_rank{}, _output{}, _func{} +{ +} template <typename U> inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info) @@ -82,36 +88,35 @@ inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadIn Iterator output_it(_output, window); execute_window_loop( - window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices->info()->num_dimensions(), 0); - - U new_index; - switch (_indices->info()->num_dimensions()) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); - break; - case 2: - new_index = - *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); - break; - case 3: - new_index = *( - reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(0, new_index); - - std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0])))); + break; + case 2: + new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1])))); + break; + case 3: + new_index = + *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(0, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(), + output_it.ptr()); + }, + output_it); } template <typename U> @@ -127,37 +132,36 @@ void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &inf Iterator output_it(_output, output_window); execute_window_loop( - output_window, - [&](const Coordinates &id) { - Coordinates gather_id(id); - gather_id.collapse(_indices->info()->num_dimensions(), _axis); - - U new_index; - switch (_indices->info()->num_dimensions()) - { - case 1: - new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis])))); - break; - case 2: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); - break; - case 3: - new_index = *(reinterpret_cast<U *>( - _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); - break; - default: - ARM_COMPUTE_ERROR("Wrong num of dimensions"); - break; - } - - gather_id.set(_axis, new_index); - - std::copy_n(_input->ptr_to_element(gather_id), - _input->info()->dimension(0) * _output->info()->element_size(), - output_it.ptr()); - }, - output_it); + output_window, + [&](const Coordinates &id) { + Coordinates gather_id(id); + gather_id.collapse(_indices_rank, _axis); + + U new_index; + switch (_indices_rank) + { + case 1: + new_index = *(reinterpret_cast<U 
*>(_indices->ptr_to_element(Coordinates(id[_axis])))); + break; + case 2: + new_index = *( + reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1])))); + break; + case 3: + new_index = *(reinterpret_cast<U *>( + _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2])))); + break; + default: + ARM_COMPUTE_ERROR("Wrong num of dimensions"); + break; + } + + gather_id.set(_axis, new_index); + + std::copy_n(_input->ptr_to_element(gather_id), + _input->info()->dimension(0) * _output->info()->element_size(), output_it.ptr()); + }, + output_it); } void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, @@ -167,13 +171,14 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); _input = input; _indices = indices; _output = output; _axis = axis; + _indices_rank = indices->info()->num_dimensions(); if (_axis < 0) { @@ -213,7 +218,7 @@ void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, I } // Output auto initialization if not yet initialized TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); // Create window @@ -239,15 +244,15 @@ Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions())); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex( - input->tensor_shape(), indices->tensor_shape(), axis); + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp index 30787c0a4..f178865b7 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -47,6 +47,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + 
#include <unordered_map> using namespace arm_compute; @@ -57,7 +60,7 @@ constexpr size_t NOT_HIT = 0xFFFFFFFF; } // namespace NEHashtableLookupKernel::NEHashtableLookupKernel() - : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} + : _lookups(nullptr), _keys(nullptr), _input(nullptr), _output(nullptr), _hits{nullptr} { } @@ -66,7 +69,7 @@ void NEHashtableLookupKernel::configure(const ITensor *lookups, const ITensor *k { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_THROW_ON( - validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); + validate(lookups->info(), keys->info(), input->info(), output->info(), hits->info())); _lookups = lookups; _keys = keys; @@ -92,8 +95,8 @@ Status NEHashtableLookupKernel::validate(const ITensorInfo *lookups, const ITens { ARM_COMPUTE_ERROR_ON_NULLPTR(lookups, keys, input, output, hits); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, - DataType::U32, DataType::S32, DataType::F16, DataType::F32); + input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, + DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lookups, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keys, 1, DataType::S32); @@ -134,8 +137,8 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) const size_t lookup_dim = _output->info()->num_dimensions() - 1; const int const_0 = _output->info()->data_type() == DataType::QASYMM8 - ? _output->info()->quantization_info().uniform().offset - : 0; + ? _output->info()->quantization_info().uniform().offset + : 0; std::unordered_map<int32_t, size_t> key_index_map; for (size_t n = 0; n < _keys->info()->dimension(0); ++n) @@ -174,24 +177,24 @@ void NEHashtableLookupKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, - [&](const Coordinates &id) { - const auto lookup = lookup_indices.at(id[lookup_dim]); - if (lookup == NOT_HIT) - { - memset(output_it.ptr(), const_0, - _output->info()->dimension(0) * _output->info()->element_size()); - } - else - { - Coordinates input_id{id}; - input_id.set(lookup_dim, lookup); - memcpy(output_it.ptr(), _input->ptr_to_element(input_id), - _output->info()->dimension(0) * _output->info()->element_size()); - } - - }, - output_it); + execute_window_loop( + out_slice, + [&](const Coordinates &id) { + const auto lookup = lookup_indices.at(id[lookup_dim]); + if (lookup == NOT_HIT) + { + memset(output_it.ptr(), const_0, + _output->info()->dimension(0) * _output->info()->element_size()); + } + else + { + Coordinates input_id{id}; + input_id.set(lookup_dim, lookup); + memcpy(output_it.ptr(), _input->ptr_to_element(input_id), + _output->info()->dimension(0) * _output->info()->element_size()); + } + }, + output_it); } while (window.slide_window_slice_4D(out_slice)); } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp index 49adf1462..7804f9c6a 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -40,17 +40,22 @@ #include 
"arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <arm_neon.h> namespace arm_compute @@ -63,7 +68,7 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma { /** NEON vector tag type. */ using ExactTagType = - typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; // Clear X/Y dimensions on execution window as we handle the planes manually Window win = window; @@ -73,107 +78,107 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, ITensor *gamma constexpr int window_step_x = 16 / sizeof(T); const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); const auto channel_idx = - get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL); Iterator input_it(input, win); execute_window_loop( - win, - [&](const Coordinates &id) { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast<T>(0.f); - auto sum_squares_h_w = static_cast<T>(0.f); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); - vec_sum_squares_h_w = - wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); - } - - auto vec2_sum_h_w = - wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), - wrapper::vgetlow(vec_sum_squares_h_w)); - for (int i = 0; i < window_step_x / 4; ++i) - { - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - } - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - const auto value = *(input_ptr + x); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, 
output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - auto gamma_val = 1.0f; - if (gamma != nullptr) - { - gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); - } - const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); - auto beta_val = 0.0f; - if (beta != nullptr) - { - beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); - } - const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); - - execute_window_loop( - win_plane, - [&](const Coordinates &) { - auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); - for (; x <= (window.x().end() - window_step_x); x += window_step_x) - { - vec_val = wrapper::vloadq(input_ptr + x); - vec_val = wrapper::vadd( - wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); - wrapper::vstore(output_ptr + x, vec_val); - } - - // Compute left-over elements - for (; x < window.x().end(); ++x) - { - *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; - } - }, - input_plane_it, output_plane_it); - }, - input_it); + win, + [&](const Coordinates &id) { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<T>(0.f); + auto sum_squares_h_w = static_cast<T>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vec_sum_h_w = wrapper::vadd(vec_sum_h_w, vec_input_val); + vec_sum_squares_h_w = + wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val)); + } + + auto vec2_sum_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), + wrapper::vgetlow(vec_sum_squares_h_w)); + for (int i = 0; i < window_step_x / 4; ++i) + { + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + } + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = *(input_ptr + x); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / 
elements_plane - mean_h_w * mean_h_w; + + auto gamma_val = 1.0f; + if (gamma != nullptr) + { + gamma_val = *reinterpret_cast<T *>(gamma->ptr_to_element({id[channel_idx]})); + } + const auto multip_h_w = gamma_val / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{}); + auto beta_val = 0.0f; + if (beta != nullptr) + { + beta_val = *reinterpret_cast<T *>(beta->ptr_to_element({id[channel_idx]})); + } + const auto vec_beta = wrapper::vdup_n(static_cast<T>(beta_val), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + vec_val = wrapper::vloadq(input_ptr + x); + vec_val = wrapper::vadd( + wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta); + wrapper::vstore(output_ptr + x, vec_val); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta_val; + } + }, + input_plane_it, output_plane_it); + }, + input_it); } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, @@ -199,8 +204,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - gamma->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + gamma->dimension(0), "Gamma's size must be the same as size of input's channel"); } @@ -208,8 +213,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(get_data_layout_dimension_index( - input->data_layout(), DataLayoutDimension::CHANNEL)) != - beta->dimension(0), + input->data_layout(), DataLayoutDimension::CHANNEL)) != + beta->dimension(0), "Beta's size must be the same as size of input's channel"); } @@ -234,8 +239,8 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe } // namespace NEInstanceNormalizationLayerKernelEx::NEInstanceNormalizationLayerKernelEx() - : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), - _epsilon(1e-12) + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(nullptr), _beta(nullptr), + _epsilon(1e-12) { } @@ -251,7 +256,7 @@ void NEInstanceNormalizationLayerKernelEx::configure(ITensor *input, ITensor *ou _epsilon = epsilon; ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); + validate_arguments(_input->info(), _output->info(), gamma->info(), beta->info(), epsilon)); if (_input->info()->data_type() == DataType::F32) { @@ -282,7 +287,7 @@ Status NEInstanceNormalizationLayerKernelEx::validate(const ITensorInfo *input, { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( - 
input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); return Status{}; } diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp index b92130cec..8ad998313 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -42,13 +42,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> @@ -123,15 +125,17 @@ inline float32x4x4_t multiply_scale_vec(const int32x4x4_t &iv, float scale) const float32x4_t vscale = vdupq_n_f32(scale); const float32x4x4_t ret = {{ - vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), - vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[0]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[1]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[2]), vscale), + vmulq_f32(vcvtq_f32_s32(iv.val[3]), vscale), }}; return ret; } } // namespace NEMultiplyScaleFactorKernel::NEMultiplyScaleFactorKernel() - : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) + : _input(nullptr), _scale_factor(nullptr), _output(nullptr), _multiplier(1.f) { } @@ -140,7 +144,7 @@ void NEMultiplyScaleFactorKernel::configure(const ITensor *input, const ITensor { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), scale_factor->info(), output->info())); + validate_arguments(input->info(), scale_factor->info(), output->info())); _input = input; _scale_factor = scale_factor; @@ -180,25 +184,25 @@ template <typename T> void NEMultiplyScaleFactorKernel::multiply(const Window &w Iterator output(_output, win_collapsed); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); - scale *= _multiplier; - - const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); - auto output_ptr = reinterpret_cast<T *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - output_ptr[x] = input_ptr[x] * scale; - } - }, - input, output); + win_collapsed, + [&](const Coordinates &id) { + auto scale = *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})); + scale *= _multiplier; + + const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + 
store_result<float>(&output_ptr[x], multiply_scale_vec(load_value(&input_ptr[x]), scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = input_ptr[x] * scale; + } + }, + input, output); } void NEMultiplyScaleFactorKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp new file mode 100644 index 000000000..04eb407e9 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" +#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + +namespace arm_compute +{ +namespace +{ +/** Validate the depth + * + * Validate that depth are not negative + * + * @param[in] depth Depth tensor. + * @param[in] output Output tensor. + * @param[in] axis Axis of depth. 
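Stripped of the window and iterator machinery, the one-hot expansion this new kernel performs along axis 0 reduces to: each index produces a depth-long row filled with off_value, and on_value is written at the index position only when 0 <= index < depth (the isOnValue() guard). A plain scalar sketch of those semantics, with assumed names and a float payload, assuming a non-negative depth as validate_depth() enforces:

#include <cstddef>
#include <cstdint>
#include <vector>

// Reference one-hot over axis 0: indices[i] selects which of the 'depth'
// consecutive output elements receives on_value; everything else is off_value.
std::vector<float> one_hot_axis0(const std::vector<int32_t> &indices, int32_t depth,
                                 float on_value, float off_value)
{
  std::vector<float> out(indices.size() * static_cast<size_t>(depth), off_value);
  for (size_t i = 0; i < indices.size(); ++i)
  {
    const int32_t idx = indices[i];
    if (idx >= 0 && idx < depth) // out-of-range indices leave the row all off_value
    {
      out[i * static_cast<size_t>(depth) + static_cast<size_t>(idx)] = on_value;
    }
  }
  return out;
}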
+ */ +template <typename U> void validate_depth(const ITensor *depth, const ITensor *output, int axis) +{ + ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(depth->buffer())) < 0); + ARM_COMPUTE_ERROR_ON(static_cast<U>(output->info()->tensor_shape()[axis]) != + *(reinterpret_cast<U *>(depth->buffer()))); +} + +Status validate_arguments(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output); + const int actual_axis = wrap_around(axis, static_cast<int>(output->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->tensor_shape().total_size() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(0 > actual_axis || + actual_axis >= static_cast<int>(output->num_dimensions())); + ARM_COMPUTE_RETURN_ERROR_ON(on_value->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(on_value, 1, DataType::U8, DataType::S8, + DataType::U16, DataType::S16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, off_value); + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(on_value, output); + } + + return Status{}; +} + +template <typename U, typename Enable = void> bool isOnValue(U) { return true; } + +template <typename U, std::enable_if_t<std::is_integral<U>::value, int> = 0> +bool isOnValue(U index, U depth) +{ + return index >= 0 && index < depth; +} +} // namespace + +NEOneHotKernel::NEOneHotKernel() + : _indices{nullptr}, _depth{nullptr}, _on_value{nullptr}, _off_value{nullptr}, _axis{-1}, + _output{nullptr}, _func{} +{ +} + +template <typename U> +void NEOneHotKernel::onehot_0_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + // Validate that the depth are not negative + validate_depth<U>(_depth, _output, _axis); + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator output_it(_output, output_window); + const U off_value = *reinterpret_cast<U *>(_off_value->buffer()); + execute_window_loop( + output_window, + [&](const Coordinates &id) { + std::fill_n(output_it.ptr(), _output->info()->dimension(0) * _output->info()->element_size(), + off_value); + Coordinates indices_id(id); + indices_id.remove(0); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(0, new_index); + std::copy_n(_on_value->buffer(), _output->info()->element_size(), + _output->ptr_to_element(onehot_id)); + } + }, + output_it); +} + +template <typename U> +inline void NEOneHotKernel::onehot_n_axis(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + // Validate that the indices are not negative + validate_depth<U>(_depth, _output, _axis); + Iterator output_it(_output, window); + execute_window_loop( + window, + [&](const Coordinates &id) { + Coordinates indices_id(id); + indices_id.remove(_axis); + const U new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(indices_id))); + if (isOnValue(new_index, *(reinterpret_cast<U *>(_depth->buffer())))) + { + Coordinates onehot_id(id); + onehot_id.set(_axis, 
new_index); + std::copy_n(static_cast<U>(id[_axis]) == new_index ? _on_value->buffer() + : _off_value->buffer(), + _output->info()->element_size(), output_it.ptr()); + } + }, + output_it); +} + +void NEOneHotKernel::configure(const ITensor *indices, const ITensor *depth, + const ITensor *on_value, const ITensor *off_value, ITensor *output, + int axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(indices, depth, on_value, off_value, output); + ARM_COMPUTE_ERROR_ON(output->info()->total_size() == 0); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(indices->info(), depth->info(), on_value->info(), + off_value->info(), output->info(), axis)); + _indices = indices; + _depth = depth; + _on_value = on_value; + _off_value = off_value; + _output = output; + _axis = wrap_around(axis, static_cast<int>(output->info()->num_dimensions())); + if (0 == _axis) + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEOneHotKernel::onehot_0_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEOneHotKernel::onehot_0_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + else + { + switch (_indices->info()->data_type()) + { + case DataType::U32: + _func = &NEOneHotKernel::onehot_n_axis<uint32_t>; + break; + case DataType::S32: + _func = &NEOneHotKernel::onehot_n_axis<int32_t>; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + // Create window + Window win = calculate_max_window(*output->info(), Steps()); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + INEKernel::configure(win); +} + +Status NEOneHotKernel::validate(const ITensorInfo *indices, const ITensorInfo *depth, + const ITensorInfo *on_value, const ITensorInfo *off_value, + const ITensorInfo *output, int axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(indices, depth, on_value, off_value, output, axis)); + return Status{}; +} + +void NEOneHotKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + (this->*_func)(window, info); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 5841f1d69..420e5063c 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -42,13 +42,16 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> @@ -107,19 +110,15 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, const int32x4x4_t rf = {{ #ifdef __aarch64__ - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, 
vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, - vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtnq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #else //__aarch64__ - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), - vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[0], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[1], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[2], vinvscale))))), + vminq_s32(vposend, vmaxq_s32(vnagend, vcvtq_s32_f32(round(vmulq_f32(fv.val[3], vinvscale))))), #endif //__aarch64__ }}; const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); @@ -129,7 +128,7 @@ inline int8x16_t vquantizeSymm(const float32x4x4_t &fv, float scale_factor_inv, } // namespace NEQuantizationSymmetricKernel::NEQuantizationSymmetricKernel() - : _input(nullptr), _output(nullptr), _scale_factor(nullptr) + : _input(nullptr), _output(nullptr), _scale_factor(nullptr) { } @@ -138,7 +137,7 @@ void NEQuantizationSymmetricKernel::configure(const ITensor *input, ITensor *out { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON( - validate_arguments(input->info(), output->info(), scale_factor->info())); + validate_arguments(input->info(), output->info(), scale_factor->info())); _input = input; _output = output; @@ -182,40 +181,40 @@ template <typename T> void NEQuantizationSymmetricKernel::quantize(const Window const auto dim_x = _input->info()->dimension(0); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); execute_window_loop( - win_collapsed, - [&](const Coordinates &id) { - const auto start = reinterpret_cast<const T *>(input.ptr()); - const auto min_max = std::minmax_element(start, start + dim_x); - const auto int8_scale = 127; - auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); - if (range == 0) - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; - range = 1; - } - else - { - *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; - } - const auto scale_factor_inv = int8_scale / range; - - auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - int x = window_start_x; - for (; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], - vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); - } - // Compute left-over elements - for (; x < window_end_x; ++x) - { - int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); - quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); - output_ptr[x] = static_cast<int8_t>(quantized); - } - }, - input, output); + 
win_collapsed, + [&](const Coordinates &id) { + const auto start = reinterpret_cast<const T *>(input.ptr()); + const auto min_max = std::minmax_element(start, start + dim_x); + const auto int8_scale = 127; + auto range = std::max(std::abs(*min_max.first), std::abs(*min_max.second)); + if (range == 0) + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = 1; + range = 1; + } + else + { + *reinterpret_cast<T *>(_scale_factor->ptr_to_element({id.y()})) = range / int8_scale; + } + const auto scale_factor_inv = int8_scale / range; + + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], + vquantizeSymm(load_value(&input_ptr[x]), scale_factor_inv, int8_scale)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int quantized = arm_compute::round(input_ptr[x] * scale_factor_inv, rounding_policy); + quantized = std::min(int8_scale, std::max(quantized, -int8_scale)); + output_ptr[x] = static_cast<int8_t>(quantized); + } + }, + input, output); } void NEQuantizationSymmetricKernel::run(const Window &window, const ThreadInfo &info) diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp deleted file mode 100644 index 3b65eac10..000000000 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEReductionOperationKernelEx.cpp +++ /dev/null @@ -1,693 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "arm_compute/core/NEON/kernels/NEReductionOperationKernelEx.h" - -#include "arm_compute/core/CPP/Validate.h" -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "arm_compute/core/NEON/wrapper/wrapper.h" -#include <arm_neon.h> - -namespace arm_compute -{ -namespace -{ -// Helper function to calculate the minimum value of the input vector. All the elements in the -// output vector contain the min value. -float32x2_t calculate_min(float32x4_t in) -{ - auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); - return wrapper::vpmin(pmin, pmin); -} - -// Helper function to calculate the maximum value of the input vector. All the elements in the -// output vector contain the max value. -float32x2_t calculate_max(float32x4_t in) -{ - auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); - return wrapper::vpmax(pmax, pmax); -} -// Helper function to calculate the minimum value of the input vector. All the elements in the -// output vector contain the min value. -int32x2_t calculate_min(int32x4_t in) -{ - auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); - return wrapper::vpmin(pmin, pmin); -} - -// Helper function to calculate the maximum value of the input vector. All the elements in the -// output vector contain the max value. -int32x2_t calculate_max(int32x4_t in) -{ - auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); - return wrapper::vpmax(pmax, pmax); -} - -// Helper function to calculate the minimum value of the input vector. All the elements in the -// output vector contain the min value. -inline uint8x8_t calculate_min(uint8x16_t in) -{ - auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmin = wrapper::vpmin(pmin, pmin); - pmin = wrapper::vpmin(pmin, pmin); - return wrapper::vpmin(pmin, pmin); -} -// Helper function to calculate the maximum value of the input vector. All the elements in the -// output vector contain the max value. -inline uint8x8_t calculate_max(uint8x16_t in) -{ - auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmax = wrapper::vpmax(pmax, pmax); - pmax = wrapper::vpmax(pmax, pmax); - return wrapper::vpmax(pmax, pmax); -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -// Helper function to calculate the minimum value of the input vector. All the elements in the -// output vector contain the min value. -inline float16x4_t calculate_min(float16x8_t in) -{ - auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmin = wrapper::vpmin(pmin, pmin); - return wrapper::vpmin(pmin, pmin); -} -// Helper function to calculate the maximum value of the input vector. All the elements in the -// output vector contain the max value. 
-inline float16x4_t calculate_max(float16x8_t in) -{ - auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); - pmax = wrapper::vpmax(pmax, pmax); - return wrapper::vpmax(pmax, pmax); -} -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - -template <class F> class Reducer -{ -public: - static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, - const ReduceOperation op) - { - // Set out window - Window out_window(window); - out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); - - // Get first input and output slices - Window in_slice = window.first_slice_window_1D(); - Window out_slice = out_window.first_slice_window_1D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, out_slice); - - f(in, out, in_slice, out_slice, *input->info(), op); - } while (window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice)); - } - static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, - const ReduceOperation op) - { - // Set in window - Window in_window(window); - Window out_window(window); - - in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); - out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), - output->info()->dimension(1))); - - // Get first input and output slices - Window in_slice = in_window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, out_slice); - - f(in, out, in_slice, out_slice, *input->info(), 1, op); - } while (in_window.slide_window_slice_2D(in_slice) && - out_window.slide_window_slice_2D(out_slice)); - } - static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, - const ReduceOperation op) - { - // Set in window - Window in_window(window); - Window out_window(window); - - in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); - out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), - output->info()->dimension(2))); - - // Get first input and output slices - Window in_slice = in_window.first_slice_window_3D(); - Window out_slice = out_window.first_slice_window_3D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, out_slice); - - f(in, out, in_slice, out_slice, *input->info(), 2, op); - } while (in_window.slide_window_slice_3D(in_slice) && - out_window.slide_window_slice_3D(out_slice)); - } - static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, - const ReduceOperation op) - { - // Set in/out window - Window in_window(window); - Window out_window(window); - - in_window.set(3, Window::Dimension(0, 1, 1)); - out_window.set(3, Window::Dimension(0, 1, 1)); - - // Get first input and output slices - Window in_slice = in_window.first_slice_window_4D(); - Window out_slice = out_window.first_slice_window_4D(); - - do - { - Iterator in(input, in_slice); - Iterator out(output, out_slice); - - f(in, out, in_slice, out_slice, *input->info(), 3, op); - } while (in_window.slide_window_slice_4D(in_slice) && - out_window.slide_window_slice_4D(out_slice)); - } -}; - -template <typename T, int S> struct RedOpX -{ - /** NEON vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, - const TensorInfo &in_info, const ReduceOperation op) - { - ARM_COMPUTE_UNUSED(out_slice); - ARM_COMPUTE_UNUSED(in_info); - auto init_res_value = static_cast<T>(0.f); - switch (op) - { - case ReduceOperation::MIN: - case ReduceOperation::MAX: - { - init_res_value = *reinterpret_cast<T *>(input.ptr()); - break; - } - default: - break; - } - auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); - - execute_window_loop(in_slice, - [&](const Coordinates &) { - const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto vec_elements = wrapper::vloadq(in_ptr); - - switch (op) - { - case ReduceOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReduceOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input); - - switch (op) - { - case ReduceOperation::MIN: - { - *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_min(vec_res_value), 0); - break; - } - case ReduceOperation::MAX: - { - *(reinterpret_cast<T *>(output.ptr())) = wrapper::vgetlane(calculate_max(vec_res_value), 0); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } -}; - -struct RedOpX_qasymm8 -{ - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, - const TensorInfo &in_info, const ReduceOperation op) - { - ARM_COMPUTE_UNUSED(out_slice); - ARM_COMPUTE_UNUSED(in_info); - - uint8x16_t vec_res_value = {0}; - - if (op == ReduceOperation::MIN || op == ReduceOperation::MAX) - { - vec_res_value = wrapper::vdup_n(*input.ptr(), wrapper::traits::vector_128_tag{}); - } - - execute_window_loop(in_slice, - [&](const Coordinates &) { - const auto vec_elements = wrapper::vloadq(input.ptr()); - switch (op) - { - case ReduceOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReduceOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input); - - switch (op) - { - case ReduceOperation::MIN: - { - *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - break; - } - case ReduceOperation::MAX: - { - *(output.ptr()) = static_cast<uint8_t>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - break; - } - default: - { - ARM_COMPUTE_ERROR("Not supported"); - } - } - } -}; - -template <typename T, int S> struct RedOpYZW -{ - /** NEON vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; - using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; - - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, - const TensorInfo &in_info, int axis, const ReduceOperation op) - { - ARM_COMPUTE_UNUSED(out_slice); - - execute_window_loop( - in_slice, - [&](const Coordinates &) { - neon_vector vec_res_value = {0}; - switch (op) - { - case ReduceOperation::MIN: - case ReduceOperation::MAX: - { - vec_res_value = wrapper::vloadq(reinterpret_cast<T *>(input.ptr())); - break; - } - default: - { - vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); - break; - } - } - - for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - T *in_ptr; - switch (axis) - { - case 1: - in_ptr = reinterpret_cast<T *>( - input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim))); - break; - case 2: - in_ptr = reinterpret_cast<T *>( - input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim))); - break; - case 3: - in_ptr = reinterpret_cast<T *>( - input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim))); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - const auto vec_elements = wrapper::vloadq(in_ptr); - - switch (op) - { - case ReduceOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReduceOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_res_value); - }, - input, output); - } -}; - -struct RedOpYZW_qasymm8 -{ - inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, - const TensorInfo &in_info, int axis, const ReduceOperation op) - { - ARM_COMPUTE_UNUSED(out_slice); - - execute_window_loop( - in_slice, - [&](const Coordinates &) { - auto vec_res_value = wrapper::vloadq(input.ptr()); - - for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) - { - uint8_t *in_ptr; - switch (axis) - { - case 1: - in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim)); - break; - case 2: - in_ptr = - input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim)); - break; - case 3: - in_ptr = - input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim)); - break; - default: - ARM_COMPUTE_ERROR("Not supported"); - } - const auto vec_elements = wrapper::vloadq(in_ptr); - - switch (op) - { - case ReduceOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } - case ReduceOperation::MAX: - { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - wrapper::vstore(reinterpret_cast<uint8_t *>(output.ptr()), vec_res_value); - }, - input, output); - } -}; - -void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, - const ReduceOperation op) -{ - const bool is_complex = (input->info()->num_channels() == 2); - if (is_complex) - { - ARM_COMPUTE_ERROR("Not supported"); - } - - switch (axis) - { - case 0: - switch (input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer<RedOpX_qasymm8>::reduceX(window, input, output, RedOpX_qasymm8(), op); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return 
Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, - RedOpX<float16_t, 8>(), op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op); - case DataType::S32: - return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), - op); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 1: - switch (input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer<RedOpYZW_qasymm8>::reduceY(window, input, output, RedOpYZW_qasymm8(), op); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, - RedOpYZW<float16_t, 8>(), op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), - op); - case DataType::S32: - return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, - RedOpYZW<int32_t, 4>(), op); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 2: - switch (input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer<RedOpYZW_qasymm8>::reduceZ(window, input, output, RedOpYZW_qasymm8(), op); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, - RedOpYZW<float16_t, 8>(), op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), - op); - case DataType::S32: - return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, - RedOpYZW<int32_t, 4>(), op); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - case 3: - switch (input->info()->data_type()) - { - case DataType::QASYMM8: - return Reducer<RedOpYZW_qasymm8>::reduceW(window, input, output, RedOpYZW_qasymm8(), op); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, - RedOpYZW<float16_t, 8>(), op); -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F32: - return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), - op); - case DataType::S32: - return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, - RedOpYZW<int32_t, 4>(), op); - default: - ARM_COMPUTE_ERROR("Not supported"); - } - default: - ARM_COMPUTE_ERROR("Unsupported reduction axis"); - } -} - -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, - ReduceOperation op) -{ - ARM_COMPUTE_UNUSED(op); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - - if (input->num_channels() == 1) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, - DataType::F16, DataType::F32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_MSG("Not support complex"); - } - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, - "Reduction axis greater than max number of dimensions"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - - if (output->total_size() != 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); - - 
const TensorShape output_shape = - arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); - const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); - } - - return Status{}; -} - -std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, - unsigned int axis, ReduceOperation op) -{ - ARM_COMPUTE_UNUSED(op); - - // Calculate output shape and set if empty - const TensorShape output_shape = - arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); - - // Output auto initialization if not yet initialized - DataType output_data_type = input->data_type(); - auto_init_if_empty(*output, input->clone() - ->set_tensor_shape(output_shape) - .set_data_type(output_data_type) - .reset_padding() - .set_is_resizable(true)); - - unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) - ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") - : Status{}; - - return std::make_tuple(err, win); -} -} // namespace - -NEReductionOperationKernelEx::NEReductionOperationKernelEx() - : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReduceOperation::MAX), - _border_size() -{ -} - -BorderSize NEReductionOperationKernelEx::border_size() const { return _border_size; } - -void NEReductionOperationKernelEx::configure(const ITensor *input, ITensor *output, - unsigned int axis, ReduceOperation op) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op)); - - unsigned int num_elems_processed_per_iteration = - 16 / data_size_from_type(input->info()->data_type()); - - _input = input; - _output = output; - _border_size = - (axis == 0) - ? 
BorderSize(0, num_elems_processed_per_iteration - - (input->info()->dimension(0) % num_elems_processed_per_iteration), - 0, 0) - : BorderSize(); - _op = op; - _reduction_axis = axis; - - // Configure kernel window - auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis, op); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - INEKernel::configure(std::get<1>(win_config)); -} - -Status NEReductionOperationKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, - unsigned int axis, ReduceOperation op) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( - validate_and_configure_window(input->clone().get(), output->clone().get(), axis, op))); - - return Status{}; -} - -void NEReductionOperationKernelEx::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - reduce_op(window, _input, _output, _reduction_axis, _op); -} -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp new file mode 100644 index 000000000..6b9b0d4b4 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" + +namespace arm_compute +{ +CLArgMinMaxLayerEx::CLArgMinMaxLayerEx(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), + _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis() +{ +} + +Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, + const ReductionOperation &op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && + op != ReductionOperation::ARG_IDX_MIN, + "Invalid reduction operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), + "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); + const unsigned int num_of_stages = + utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + + DataType output_data_type = DataType::S32; + TensorInfo not_reshaped_output; + const auto input_num_channles = input->num_channels(); + const auto input_qinfo = input->quantization_info(); + + if (output->total_size() != 0) + { + output_data_type = output->data_type(); + const TensorInfo expected_output_shape = + output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape( + input->tensor_shape(), axis, false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); + } + + auto shape_before_reshape = input->tensor_shape(); + shape_before_reshape.set(axis, 1); + auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, + int num_channels, QuantizationInfo qinfo) { + ti.set_data_type(data_type) + .set_tensor_shape(shape) + .set_num_channels(num_channels) + .set_quantization_info(qinfo); + }; + + initialize_tensorinfo(not_reshaped_output, shape_before_reshape, output_data_type, + input_num_channles, input_qinfo); + + if (num_of_stages == 1) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, nullptr, ¬_reshaped_output, axis, op)); + } + else + { + // Create temporary tensor infos + std::vector<TensorInfo> sums_vector(num_of_stages - 1); + + // Create intermediate tensor info + TensorShape shape{input->tensor_shape()}; + + for (unsigned int i = 0; i < num_of_stages - 1; i++) + { + shape.set(0, ceil(shape.x() / 128.f)); + sums_vector[i].set_data_type(input->data_type()); + sums_vector[i].set_tensor_shape(shape); + sums_vector[i].set_num_channels(input->num_channels()); + } + + // Validate ReductionOperation only on first kernel + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, nullptr, &sums_vector[0], axis, op)); + + // Validate ReductionOperation on intermediate stages + for (unsigned int i = 1; i < num_of_stages - 1; ++i) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLArgMinMaxLayerKernelEx::validate(input, &sums_vector[i - 1], &sums_vector[i], axis, op)); + } + + // Validate ReductionOperation on the last stage + const unsigned int last_stage = num_of_stages - 1; + 
ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( + input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); + } + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(¬_reshaped_output, output)); + return Status{}; +} + +void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *output, + const ReductionOperation &op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); + _reduction_axis = axis; + + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape( + input->info()->tensor_shape(), axis, false); + DataType output_data_type = (output->info()->data_type() == DataType::UNKNOWN) + ? DataType::S32 + : output->info()->data_type(); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + + // Configure reduction operation kernels + _reduction_kernels_vector.resize(_num_of_stages); + + _memory_group.manage(&_not_reshaped_output); + // Create temporary tensors + if (_num_of_stages == 1) + { + // Force an early initialization for int64 output type + TensorShape output_shape{input->info()->tensor_shape()}; + output_shape.set(axis, 1); + auto_init_if_empty(*_not_reshaped_output.info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); + _not_reshaped_output.info()->set_tensor_shape(output_shape); + _reduction_kernels_vector[0].configure(input, nullptr, &_not_reshaped_output, axis, op); + } + else + { + _results_vector.resize(_num_of_stages - 1); + TensorShape shape{input->info()->tensor_shape()}; + for (unsigned int i = 0; i < _num_of_stages - 1; i++) + { + shape.set(0, ceil(shape.x() / 128.f)); + _results_vector[i].allocator()->init( + input->info()->clone()->set_tensor_shape(shape).set_data_type(output_data_type)); + } + + // Apply ReductionOperation only on first kernel + _memory_group.manage(&_results_vector[0]); + _reduction_kernels_vector[0].configure(input, nullptr, &_results_vector[0], axis, op); + + // Apply ReductionOperation on intermediate stages + for (unsigned int i = 1; i < _num_of_stages - 1; ++i) + { + _memory_group.manage(&_results_vector[i]); + _reduction_kernels_vector[i].configure(input, &_results_vector[i - 1], &_results_vector[i], + axis, op); + _results_vector[i - 1].allocator()->allocate(); + } + + // Apply ReductionOperation on the last stage + const unsigned int last_stage = _num_of_stages - 1; + _reduction_kernels_vector[last_stage].configure(input, &_results_vector[last_stage - 1], + &_not_reshaped_output, axis, op); + _results_vector[last_stage - 1].allocator()->allocate(); + } + _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output, + output); + _not_reshaped_output.allocator()->allocate(); +} + +void CLArgMinMaxLayerEx::run() +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + for (unsigned int i = 0; i < _num_of_stages; ++i) + { + CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); + } + _reshape_kernel.run(); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp index e5122ab8f..31c96b080 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +++ 
b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -42,13 +42,14 @@ #include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, BinaryLogicalOperation op) { - auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + auto k = std::make_unique<CLBinaryLogicalOpKernel>(); k->configure(input1, input2, output, op); _kernel = std::move(k); @@ -57,7 +58,7 @@ void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTenso ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp index 768c15b41..96f9c17a9 100644 --- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp @@ -15,7 +15,7 @@ */ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2018-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -38,17 +38,15 @@ * SOFTWARE. */ -#include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h" +#include "arm_compute/runtime/CL/functions/CLCastBool.h" -#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h" -#include "support/MemorySupport.h" +#include "arm_compute/core/CL/kernels/CLCastBoolKernel.h" using namespace arm_compute; -void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, - const ITensor *off_value, ITensor *output, const int axis) +void CLCastBool::configure(ICLTensor *input, ICLTensor *output) { - auto k = support::cpp14::make_unique<CPPOneHotKernelEx>(); - k->configure(indices, depth, on_value, off_value, output, axis); + auto k = std::make_unique<CLCastBoolKernel>(); + k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 3dede0562..464f60dee 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -45,6 +45,8 @@ #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <memory> #include <tuple> @@ -53,16 +55,10 @@ namespace arm_compute using namespace arm_compute::misc::shape_calculator; CLDirectTransposeConvLayer::CLDirectTransposeConvLayer( - std::shared_ptr<IMemoryManager> memory_manager) // NOLINT - : _memory_group(std::move(memory_manager)), - _scale_f(), - _conv_f(), - _flip_weights(), - _scaled_output(), - _original_weights(nullptr), - _weights_flipped(), - _flip_axis(), - _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) // NOLINT + : _memory_group(std::move(memory_manager)), _scale_f(), _conv_f(), _flip_weights(), + _scaled_output(), _original_weights(nullptr), _weights_flipped(), _flip_axis(), + _is_prepared(false) { } @@ -74,7 +70,7 @@ Status 
CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( - input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights); const DataLayout data_layout = input->data_layout(); @@ -86,8 +82,8 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1); auto out_dims = transposeconv_output_dimensions( - input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), - weights->dimension(idx_h), info, invalid_right, invalid_bottom); + input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), + weights->dimension(idx_h), info, invalid_right, invalid_bottom); const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights); @@ -117,19 +113,19 @@ Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITen unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; - const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top, - pad_bottom); + const TensorShape scale_out_shape = + compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right, + invalid_bottom, pad_left, pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape) - .set_data_layout(data_layout)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(scale_out_shape) + .set_data_layout(data_layout)); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info)); - ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, - conv_info, weights_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info)); return Status{}; } @@ -171,22 +167,22 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); auto out_dims = transposeconv_output_dimensions( - input->info()->dimension(idx_w), input->info()->dimension(idx_h), - weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, - invalid_bottom); + input->info()->dimension(idx_w), input->info()->dimension(idx_h), + weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right, + invalid_bottom); const TensorShape output_shape = - compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); + compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info()); // Output auto initialization if not yet initialized auto_init_if_empty( - *output->info(), - input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); + *output->info(), + input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate( - 
input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), - info, invalid_right, invalid_bottom)); + input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, + invalid_right, invalid_bottom)); _is_prepared = weights_info.retain_internal_weights(); @@ -195,8 +191,8 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order // to match output shape const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape( - *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, - pad_right, pad_top, pad_bottom); + *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left, + pad_right, pad_top, pad_bottom); TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp index ae9d8afc6..003ec8042 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -39,7 +39,6 @@ */ #include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" - #include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" using namespace arm_compute; @@ -47,7 +46,7 @@ using namespace arm_compute; void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups) { - auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + auto k = std::make_unique<CLEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index 01989461e..af936e873 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -45,7 +45,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" #include <algorithm> @@ -60,7 +59,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I ARM_COMPUTE_UNUSED(weights); ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); + CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -68,7 +67,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) { - auto k = support::cpp14::make_unique<CLTransposeKernel>(); + auto k = std::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } @@ -80,12 +79,12 @@ Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), - _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), 
_accumulate_biases_kernel(), - _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), - _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), - _original_weights(nullptr) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), + _mm_gemmlowp(memory_manager), _multiply_scale_kernel(), _accumulate_biases_kernel(), + _reshape_weights_output(), _quantized_input(), _scale_factor(), _gemmlowp_output(), + _are_weights_reshaped(true), _accumulate_biases(false), _is_prepared(false), + _original_weights(nullptr) { } void CLFullyConnectedHybridLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -107,8 +106,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; _accumulate_biases = false; @@ -140,10 +139,10 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen bool is_fc_after_conv = false; if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -158,28 +157,28 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen { // Reshape the weights _reshape_weights_output.allocator()->init( - weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights->info()))); + weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights->info()))); _reshape_weights_kernel.configure(weights_to_use, &_reshape_weights_output); weights_to_use = &_reshape_weights_output; } // Extract scale factor _scale_factor.allocator()->init( - TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); + TensorInfo(TensorShape{output->info()->dimension(1)}, 1, input->info()->data_type())); _memory_group.manage(&_scale_factor); _scale_factor_kernel.configure(input, &_scale_factor); // Quantize input _quantized_input.allocator()->init( - input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type( + DataType::QASYMM8_SIGNED)); _memory_group.manage(&_quantized_input); _quant_input_kernel.configure(input, &_scale_factor, &_quantized_input); // GEMMLowp _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); _memory_group.manage(&_gemmlowp_output); configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output, fc_info.retain_internal_weights); @@ -209,15 +208,15 @@ Status 
CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe const GPUTarget gpu_target = CLScheduler::get().target(); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); // Configure accumulate biases kernel for non quantized asymmetric types if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); + CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target)); } // With the Fully Connected layer we can have 4 different cases: @@ -247,33 +246,32 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } // Validate Scale factor kernel const ITensorInfo &scale_factor = - TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); + TensorInfo(TensorShape{output->dimension(1)}, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor)); // Validate quantization symm8 kernel - const ITensorInfo &quantized_input = - TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type( - DataType::QASYMM8_SIGNED)); + const ITensorInfo &quantized_input = TensorInfo( + input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED)); ARM_COMPUTE_RETURN_ON_ERROR( - CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); + CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input)); // Fully Connected layer after a Fully Connected Layer without batches ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); // Validate matrix multiply kernel const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output)); // Multiply scale ARM_COMPUTE_RETURN_ON_ERROR( - CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); + CLMultiplyScaleFactorKernel::validate(&gemmlowp_output, &scale_factor, output)); return Status{}; } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 2ff4b9659..c6a88d340 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -42,11 +42,11 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "support/Cast.h" #include 
<algorithm> @@ -79,7 +79,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn int output_multiplier = 0; int output_shift = 0; ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( - multiplier, &output_multiplier, &output_shift)); + multiplier, &output_multiplier, &output_shift)); // Set the GEMMLowp output stage info gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; @@ -99,7 +99,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I { GEMMLowpOutputStageInfo gemmlowp_output_stage; ARM_COMPUTE_RETURN_ON_ERROR( - construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -125,14 +125,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate( - &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, - gemm_info)); + &input.clone()->set_quantization_info(input_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); } else { ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); + CLGEMM::validate(&input, &weights, bias, &output, 1.f, 1.f, gemm_info)); } return Status{}; @@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) { - auto k = support::cpp14::make_unique<CLTransposeKernel>(); + auto k = std::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } @@ -154,12 +154,12 @@ Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) - : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), - _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), - _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), - _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), - _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), - _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) + : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), + _convert_weights_managed(), _reshape_weights_managed_function(), _flatten_layer(), + _reshape_weights_function(), _mm_gemm(memory_manager, weights_manager), + _mm_gemmlowp(memory_manager), _flatten_output(), _converted_weights_output(), + _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + _is_fc_after_conv(true), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) { } void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTensor *weights, @@ -190,9 +190,9 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens const QuantizationInfo weights_quantization_info = weights->info()->quantization_info(); 
input->info()->set_quantization_info(QuantizationInfo( - input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); weights->info()->set_quantization_info(QuantizationInfo( - weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); @@ -214,8 +214,8 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON( - (weights->info()->dimension(1) != - (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + (weights->info()->dimension(1) != + (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be // linearized @@ -223,11 +223,11 @@ void CLFullyConnectedLayerEx::configure_conv_fc(const ICLTensor *input, const IC // Initialize output tensor for flatten TensorShape shape_flatten = compute_flatten_shape(input->info()); _flatten_output.allocator()->init(input->info() - ->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(shape_flatten) - .set_data_layout(DataLayout::NCHW)); + ->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(shape_flatten) + .set_data_layout(DataLayout::NCHW)); // Configure flatten kernel _memory_group.manage(&_flatten_output); @@ -258,8 +258,8 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayerEx::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true; @@ -285,10 +285,10 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor const bool is_batched_fc_layer = output->info()->dimension(1) > 1; if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && - (std::equal(input->info()->tensor_shape().cbegin() + 3, - input->info()->tensor_shape().cend(), - output->info()->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = + (TensorShape::num_max_dimensions >= 4) && + (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); } else { @@ -302,7 +302,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor { _reshape_weights_managed_function.configure(weights); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_reshape_weights_managed_function)); + _weights_manager->acquire(weights, &_reshape_weights_managed_function)); } else { @@ -320,7 +320,7 @@ void CLFullyConnectedLayerEx::configure(const ICLTensor *input, const ICLTensor _convert_weights_managed.configure(weights_to_use, input->info()->tensor_shape(), fc_info.weights_trained_layout); weights_to_use = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->acquire(weights, &_convert_weights_managed)); + _weights_manager->acquire(weights, &_convert_weights_managed)); } else { @@ -359,16 +359,16 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool is_fc_after_conv = true; const ITensorInfo &flatten_input = TensorInfo(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(compute_flatten_shape(input)) - .set_data_layout(DataLayout::NCHW)); + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(input)) + .set_data_layout(DataLayout::NCHW)); const ITensorInfo &reshaped_weights = - TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( - compute_transposed_shape(*weights))); + TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = - weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) - : TensorInfo(*reshaped_weights.clone()); + weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -396,7 +396,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -404,7 +404,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Validate convert weights kernel ARM_COMPUTE_RETURN_ON_ERROR(CLConvertFullyConnectedWeights::validate( - weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } @@ -412,8 +412,8 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor { // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON( - (weights_to_use->dimension(1) != - (input->dimension(0) * input->dimension(1) * input->dimension(2)))); + (weights_to_use->dimension(1) != + (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); @@ -427,7 +427,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor // Validate matrix multiply kernel ARM_COMPUTE_RETURN_ON_ERROR( - validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); return Status{}; } @@ -457,7 +457,7 @@ void CLFullyConnectedLayerEx::run() if (_weights_manager && _weights_manager->are_weights_managed(cur_weights)) { _original_weights = utils::cast::polymorphic_downcast<ICLTensor *>( - _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); + _weights_manager->run(cur_weights, &_reshape_weights_managed_function)); } else { diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index 157b4d977..cda784541 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -19,6 +19,7 @@ #include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h> +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute; @@ -41,7 +42,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp // reshape auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( - _input->info()->data_layout())); + _input->info()->data_layout())); _cl_reshape.configure(_input, &_cl_buffer); input_to_use = &_cl_buffer; } @@ -57,7 +58,7 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { bool is_hybrid = (input->info()->data_type() == DataType::F32 || input->info()->data_type() == DataType::F16) && - (weights->info()->data_type() 
== DataType::S8 || + (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); if (is_hybrid) @@ -81,7 +82,6 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp { throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type"); } - }(); if (_needs_reshape) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 000000000..cd7409417 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "support/StringSupport.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1); + + return Status{}; +} + +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target, + unsigned int &num_elems_processed_per_iteration) +{ + // Select the vector size to use (8 for Bifrost; 16 for Midgard). + bool is_gpu_bifrost = + gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, GPUTarget::G51, + GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT); + num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access( + biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), + biases->dimension(1)); + AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, biases_access, accum_access); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) +{ + configure(CLKernelLibrary::get().get_compile_context(), accum, biases); +} + +void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context, + ICLTensor *accum, const ICLTensor *biases) +{ + ARM_COMPUTE_UNUSED(compile_context); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); + + _biases = biases; + _accum = accum; + + // Get the target gpu + GPUTarget gpu_target = get_target(); + unsigned int vector_size = 0; + + // Configure kernel window + auto win_config = + validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type())); + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gemm_accumulate_biases", build_opts.options())); +} + +Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, + const ITensorInfo *biases, GPUTarget gpu_target) +{ + unsigned int num_elems_processed_per_iteration = 0; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), + biases->clone().get(), gpu_target, + num_elems_processed_per_iteration) + .first); + + return Status{}; +} + +void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window accum_slice = window.first_slice_window_2D(); + + Window biases_slice(accum_slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + add_2D_tensor_argument(idx, _accum, accum_slice); + add_1D_tensor_argument(idx, _biases, biases_slice); + + enqueue(queue, *this, accum_slice, lws_hint()); + } while (window.slide_window_slice_2D(accum_slice)); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp index e0b833b04..f380e3e2c 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -41,6 +41,8 @@ #include "arm_compute/runtime/CL/functions/CLGatherEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/kernels/CLGatherKernel.h" + #include "arm_compute/core/CL/kernels/CLGatherExKernel.h" using namespace arm_compute; @@ -48,7 +50,7 @@ using namespace arm_compute; void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) { - auto k = support::cpp14::make_unique<CLGatherExKernel>(); + auto k = std::make_unique<CLGatherExKernel>(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff 
--git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp index 65b89a389..9896abd4b 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -47,7 +47,7 @@ using namespace arm_compute; void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { - auto k = support::cpp14::make_unique<CLHashtableLookupKernel>(); + auto k = std::make_unique<CLHashtableLookupKernel>(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp index 5a7e40839..ca45a57f8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma, ICLTensor *beta, float epsilon) { - auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + auto k = std::make_unique<CLInstanceNormalizationLayerKernelEx>(); k->configure(input, output, gamma, beta, epsilon); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp index 28e5bc0da..2bdc451b3 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -46,7 +46,7 @@ using namespace arm_compute; void CLNeg::configure(ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + auto k = std::make_unique<CLNegKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp new file mode 100644 index 000000000..759a19ff3 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLOneHot.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +namespace arm_compute +{ +CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {} +void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, + const ICLTensor *off_value, ICLTensor *output, int depth, int axis) +{ + _onehot_kernel.configure(indices, on_value, off_value, output, depth, axis); +} +void CLOneHot::configure(const ICLTensor *indices, const ICLTensor *on_value, ICLTensor *output, + PixelValue off_value, int depth, int axis) +{ + _has_to_memset = true; + _memset_kernel.configure(output, off_value); + _onehot_kernel.configure(indices, on_value, output, depth, axis); +} +Status CLOneHot::validate(const ITensorInfo *indices, const ITensorInfo *on_value, + const ITensorInfo *off_value, const ITensorInfo *output, int depth, + int axis) +{ + return CLOneHotKernel::validate(indices, on_value, off_value, output, depth, axis); +} +void CLOneHot::run() +{ + if (_has_to_memset) + { + CLScheduler::get().enqueue(_memset_kernel, true); + } + + CLScheduler::get().enqueue(_onehot_kernel, false); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp new file mode 100644 index 000000000..4d940e966 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h" +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" + +namespace arm_compute +{ +CLPadLayerEx::CLPadLayerEx() + : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()), + _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false) +{ +} + +void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, + mode); +} + +void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor *input, + ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(input->info(), output->info(), padding, constant_value, mode)); + + _perform_pad = std::any_of(padding.begin(), padding.end(), + [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); + + if (_perform_pad) + { + _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); + } + else + { + Window copy_window = Window(); + copy_window.use_tensor_dimensions(output->info()->tensor_shape()); + // Copy the input to the whole output if no padding is applied + _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window); + } +} +Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { + return info.first > 0 || info.second > 0; + }); + + if (perform_pad) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernelEx::validate(input, output, padding, constant_value, mode)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output)); + } + return Status{}; +} +void CLPadLayerEx::run() +{ + if (_perform_pad) + { + CLScheduler::get().enqueue(*_pad_kernel); + } + else + { + CLScheduler::get().enqueue(*_copy_kernel); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index b198e7330..6740835a8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -40,21 +40,20 @@ #include "arm_compute/runtime/CL/functions/CLReduceOperation.h" -#include "arm_compute/core/CL/kernels/CLReduceOperationKernel.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/runtime/CL/CLScheduler.h" using namespace arm_compute; CLReduceOperation::CLReduceOperation(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), - _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() + : _memory_group(std::move(memory_manager)), _input(nullptr), _output(nullptr), _axis(), + _keep_dims(false), _interm_tensors(), _reduce_kernels(), _reshape() { } Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *output, const std::set<uint32_t> &axis, bool keep_dims, - const ReduceOperation &op) + const ReductionOperation &op) { const size_t num_of_kernels = axis.size(); const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0); @@ -62,7 +61,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1); // Create temporary tensor infos - auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + auto interm_tensors = std::make_unique<TensorInfo[]>(num_of_interm_tensors); // Create intermediate tensor info TensorShape shape{input->tensor_shape()}; @@ -92,13 +91,13 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * for (size_t i = 0; i < num_of_kernels; ++i, ++it) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); + CLReduceOperationKernel::validate(tensors[i], tensors[i + 1], *it, op)); } if (!keep_dims) { ARM_COMPUTE_RETURN_ON_ERROR( - CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); + CLReshapeLayer::validate(&interm_tensors[num_of_interm_tensors - 1], output)); } return Status{}; @@ -106,7 +105,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, const std::set<uint32_t> &axis, bool keep_dims, - ReduceOperation op) + ReductionOperation op) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), axis, keep_dims, op)); @@ -125,8 +124,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, throw std::runtime_error("CLReduceOperation: there is no axis to reduce"); } - _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); - _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); + _interm_tensors = std::make_unique<CLTensor[]>(num_of_interm_tensors); + _reduce_kernels = std::make_unique<CLReduceOperationKernel[]>(num_of_kernels); // Set a vector that is ordered ICLTensors sequentially. 
std::vector<ICLTensor *> tensors; @@ -137,7 +136,7 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, } tensors.emplace_back(output); - // Apply ReduceOperation on all kernels + // Apply ReductionOperation on all kernels TensorShape shape{input->info()->tensor_shape()}; auto it = axis.begin(); for (size_t i = 0; i < num_of_kernels; ++i, ++it) diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp new file mode 100644 index 000000000..bca4d5cb6 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLSplitVEx.h" +#include "support/ToolchainSupport.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" +#include <cassert> + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ICLTensor *size_splits, const std::vector<ICLTensor *> &outputs, + unsigned int num_splits) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(size_splits->info()->num_dimensions() != 1, + "size_splits must be a 1-D tensor."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_splits != outputs.size(), + "Number of output tensors does not match number of splits."); + return Status{}; +} + +Status validate_slices(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, + uint32_t split_dim) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON(split_dim >= input->num_dimensions()); + ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2); + + // Start/End coordinates + Coordinates start_coords; + Coordinates end_coords; + for (unsigned int d = 0; d < input->num_dimensions(); ++d) + { + end_coords.set(d, -1); + } + unsigned int axis_offset = 0; + // Validate output tensors + for (const auto &output : outputs) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + // Get output shape + const TensorShape output_shape = output->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0); + + const size_t axis_split_step = output_shape[split_dim]; + + // Output auto inizialitation if not yet initialized + TensorInfo tmp_output_info = *output->clone(); + auto_init_if_empty(tmp_output_info, + input->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + + // Update coordinate on axis + start_coords.set(split_dim, axis_offset); + end_coords.set(split_dim, axis_offset + axis_split_step); + + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords)); + + axis_offset += axis_split_step; + } + + return Status{}; +} + +void configure_slices(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, + std::vector<CLSlice> &_slice_functions, uint32_t split_dim) +{ + unsigned int axis_offset = 0; + // Start/End coordinates + Coordinates start_coords; + Coordinates end_coords; + for (unsigned int d = 0; d < input->info()->num_dimensions(); ++d) + { + end_coords.set(d, -1); + } + int out_iter = 0; + for (const auto &output : outputs) + { + const TensorShape output_shape = output->info()->tensor_shape(); + auto op_size = output_shape.total_size(); + if (!op_size) + { + continue; + } + + assert(op_size != 0); + assert(split_dim <= output_shape.num_dimensions()); + + const size_t axis_split_step = output_shape[split_dim]; + + // Output auto inizialitation if not yet initialized + TensorInfo tmp_output_info = *output->info()->clone(); + auto_init_if_empty( + tmp_output_info, + input->info()->clone()->set_is_resizable(true).set_tensor_shape(output_shape)); + + // Update coordinate on axis + start_coords.set(split_dim, axis_offset); + end_coords.set(split_dim, axis_offset + axis_split_step); + + // Configure slice function + _slice_functions[out_iter].configure(input, output, start_coords, end_coords); + + // Set valid region from shape + 
outputs[out_iter++]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape)); + axis_offset += axis_split_step; + } +} + +} // namespace + +CLSplitVEx::CLSplitVEx() + : _input(nullptr), _size_splits(nullptr), _outputs(), _num_splits(0), _slice_functions() +{ +} + +void CLSplitVEx::configure(const ICLTensor *input, const ICLTensor *size_splits, uint32_t split_dim, + const std::vector<ICLTensor *> &outputs, unsigned int num_splits) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, size_splits); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(size_splits, outputs, num_splits)); + + _input = input; + _size_splits = size_splits; + _outputs = outputs; + _num_splits = num_splits; + + // Create tensor slices + _slice_functions.resize(_num_splits); + + // Extract output tensor info + std::vector<ITensorInfo *> outputs_info; + for (auto &&output : _outputs) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + outputs_info.emplace_back(output->info()); + } + + // Validate slices + ARM_COMPUTE_ERROR_THROW_ON(validate_slices(_input->info(), outputs_info, split_dim)); + + // Configure slices + configure_slices(_input, _outputs, _slice_functions, split_dim); +} + +void CLSplitVEx::run() +{ + // execute the slices + for (unsigned i = 0; i < _outputs.size(); ++i) + { + _slice_functions[i].run(); + } +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp index 3ac95a8e6..accd51302 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -49,14 +49,14 @@ namespace arm_compute { CLTopKV2::CLTopKV2() - : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), - _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), - _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), - _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), - _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), - _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), - _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), - _reorder_negatives_kernel(), _store_kernel()*/ + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), _p_out_key_buf(nullptr), + _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr) /*, _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel()*/ { } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index 3215d01a7..f3f093c18 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -53,7 +53,7 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_manager(std::move(memory_manager)), _function() + : 
_memory_manager(std::move(memory_manager)), _function() { } @@ -79,7 +79,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC { case DeconvolutionMethod::DIRECT: { - auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>(); + auto f = std::make_unique<CLDirectTransposeConvLayer>(); f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info); _function = std::move(f); @@ -87,7 +87,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC } case DeconvolutionMethod::GEMM: { - auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); f->configure(compile_context, input, weights, bias, output, deconv_info); _function = std::move(f); break; @@ -105,20 +105,20 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); switch (CLTransposeConvLayer::get_deconvolution_method( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)) { case DeconvolutionMethod::DIRECT: { // Validate direct convolution layer ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate( - input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); + input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info)); break; } case DeconvolutionMethod::GEMM: { // Validate gemm-based convolution layer ARM_COMPUTE_RETURN_ON_ERROR( - CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); + CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info)); break; } default: @@ -130,9 +130,9 @@ Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf } DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method( - const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, - ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, - unsigned int invalid_bottom, const WeightsInfo &weights_info) + const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, + ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right, + unsigned int invalid_bottom, const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(output, bias, weights_info); diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp deleted file mode 100644 index 2fc94b267..000000000 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Copyright (c) 2018-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" -#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> - -#include "arm_compute/core/ITensor.h" -#include "support/MemorySupport.h" - -#include <utility> - -namespace arm_compute -{ - -template <BinaryLogicalOperation COP> -void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, - ITensor *output) -{ - auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); - k->configure(COP, input1, input2, output); - _kernel = std::move(k); -} - -template <BinaryLogicalOperation COP> -Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, - const ITensorInfo *input2, - const ITensorInfo *output) -{ - return NEBinaryLogicalOperationKernel::validate(COP, input1, input2, output); -} - -void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, - BinaryLogicalOperation op) -{ - auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); - k->configure(op, input1, input2, output); - _kernel = std::move(k); -} - -Status NEBinaryLogicalOperation::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *output, BinaryLogicalOperation op) -{ - return NEBinaryLogicalOperationKernel::validate(op, input1, input2, output); -} - -// Supported Specializations -template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::AND>; -template class NEBinaryLogicalOperationStatic<BinaryLogicalOperation::OR>; -} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp new file mode 100644 index 000000000..f6eec2603 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2016-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/NEON/functions/NECastBool.h" + +#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" + +using namespace arm_compute; + +void NECastBool::configure(const ITensor *input, ITensor *output) +{ + auto k = std::make_unique<NECastBoolKernel>(); + k->configure(input, output); + _kernel = std::move(k); +} + +Status NECastBool::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return NECastBoolKernel::validate(input, output); +} diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp index e0ab3e025..99fc5c579 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -41,13 +41,12 @@ #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" -#include "support/MemorySupport.h" using namespace arm_compute; void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) { - auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + auto k = std::make_unique<NEEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index a123439d9..fbd88fff0 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -58,7 +58,7 @@ namespace Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); 
+ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output)); return Status{}; } @@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) { - auto k = support::cpp14::make_unique<NETransposeKernel>(); + auto k = std::make_unique<NETransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } @@ -78,11 +78,11 @@ Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *in } NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( - std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), - _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), - _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), - _accumulate_biases(false), _is_prepared(false) + std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), + _mm_gemmlowp(), _accumulate_biases_kernel(), _reshape_weights_output(), _quantized_input(), + _scale_factor(), _original_weights(nullptr), _are_weights_reshaped(false), + _accumulate_biases(false), _is_prepared(false) { } @@ -103,8 +103,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedHybridLayer::validate( - input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), - fc_info)); + input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + fc_info)); _are_weights_reshaped = fc_info.transpose_weights ? 
fc_info.are_weights_reshaped : true;
   _accumulate_biases = false;
@@ -132,10 +132,10 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
   bool _is_fc_after_conv;
   if (is_batched_fc_layer)
   {
-    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                    input->info()->tensor_shape().cend(),
-                                    output->info()->tensor_shape().cbegin() + 1));
+    _is_fc_after_conv =
+      (TensorShape::num_max_dimensions >= 4) &&
+      (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(),
+                  output->info()->tensor_shape().cbegin() + 1));
   }
   else
   {
@@ -150,23 +150,23 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
   {
     // Reshape the weights
     _reshape_weights_output.allocator()->init(
-        weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-            compute_transposed_shape(*weights->info())));
+      weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+        compute_transposed_shape(*weights->info())));
     _reshape_weights_function.configure(weights_to_use, &_reshape_weights_output);
     weights_to_use = &_reshape_weights_output;
   }
   // Quantize input
   _quantized_input.allocator()->init(
-      input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
-          DataType::QASYMM8_SIGNED));
+    input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+      DataType::QASYMM8_SIGNED));
   _scale_factor.allocator()->init(
-      TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
+    TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
   _quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
   // GEMM
   _gemmlowp_output.allocator()->init(
-      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+    output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
   configure_mm(&_quantized_input, weights_to_use, &_gemmlowp_output);
   // Multiply scale
@@ -195,8 +195,8 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
   bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
   const ITensorInfo &reshaped_weights =
-      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_transposed_shape(*weights)));
+    TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+      compute_transposed_shape(*weights)));
   // Configure accumulate biases kernel for non quantized asymmetric types
   if (biases != nullptr)
@@ -217,7 +217,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
   {
     // Validate reshape weights kernel
     ARM_COMPUTE_RETURN_ON_ERROR(
-        NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
+      NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights));
     weights_to_use = &reshaped_weights;
   }
@@ -225,20 +225,19 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
   ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
   // Validate quantization kernel
-  const ITensorInfo &quantized_input =
-      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
-          DataType::QASYMM8_SIGNED));
+  const ITensorInfo &quantized_input = TensorInfo(
+    input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::QASYMM8_SIGNED));
   const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
   ARM_COMPUTE_RETURN_ON_ERROR(
-      NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
+    NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
   const ITensorInfo &gemmlowp_output = TensorInfo(
-      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+    output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
   // Validate matrix multiply kernel
   ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(quantized_input, *weights_to_use, gemmlowp_output));
   ARM_COMPUTE_RETURN_ON_ERROR(NEMultiplyScaleFactorKernel::validate(
-      &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale));
+    &gemmlowp_output, &scale_factor, output, weights->quantization_info().uniform().scale));
   return Status{};
 }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
index cb7557a5a..758f7dc59 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -50,7 +50,8 @@
 #include <algorithm>
 #include <cmath>
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
 namespace
@@ -69,14 +70,14 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
     // Validate gemmlowp function
     ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(
-        &input.clone()->set_quantization_info(input_quantization_info),
-        &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
+      &input.clone()->set_quantization_info(input_quantization_info),
+      &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output));
   }
   else
   {
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(
-        &input, &weights, nullptr, &output, 1.f, 0.0f,
-        GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+      NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f,
+                       GEMMInfo(false, false, false /* Reshape weights only for the first run */)));
   }
   return Status{};
@@ -84,12 +85,12 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
 } // namespace
 NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
-      _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
-      _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(),
-      _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr),
-      _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false),
-      _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
+  : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(),
+    _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(),
+    _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(),
+    _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true),
+    _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false),
+    _is_quantized(false), _is_prepared(false)
 {
 }
@@ -105,9 +106,9 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *
     const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
     input->info()->set_quantization_info(QuantizationInfo(
-        input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+      input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
     weights->info()->set_quantization_info(QuantizationInfo(
-        weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+      weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
     // Configure gemmlowp function
     _mm_gemmlowp.configure(input, weights, nullptr, output);
@@ -129,8 +130,8 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen
                                                 ITensor *output)
 {
   ARM_COMPUTE_ERROR_ON(
-      (weights->info()->dimension(1) !=
-       (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+    (weights->info()->dimension(1) !=
+     (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
   // If the fully connected layer is called after a convolution layer, the input tensor must be
   // linearized
@@ -138,8 +139,7 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen
   // Initialize output tensor for flatten
   TensorShape shape_flatten = compute_flatten_shape(input->info());
   _flatten_output.allocator()->init(
-      input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          shape_flatten));
+    input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
   // Configure flatten kernel
   _memory_group.manage(&_flatten_output);
@@ -165,12 +165,11 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
                                         const ITensor *biases, ITensor *output,
                                         FullyConnectedLayerInfo fc_info)
 {
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-  // Perform validate step
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
   ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
-      input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
-      fc_info));
+    input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+    fc_info));
   _are_weights_converted = true;
   _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
@@ -183,8 +182,7 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
   if (_is_quantized)
   {
     _gemmlowp_output.allocator()->init(
-        output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
-            DataType::S32));
+      output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
   }
   // Configure accumulate biases kernel for non quantized asymmetric types
@@ -208,10 +206,10 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
   const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
   if (is_batched_fc_layer)
   {
-    _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
-                        (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                    input->info()->tensor_shape().cend(),
-                                    output->info()->tensor_shape().cbegin() + 1));
+    _is_fc_after_conv =
+      (TensorShape::num_max_dimensions >= 4) &&
+      (std::equal(input->info()->tensor_shape().cbegin() + 3, input->info()->tensor_shape().cend(),
+                  output->info()->tensor_shape().cbegin() + 1));
   }
   else
   {
@@ -284,16 +282,16 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
   bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
   const ITensorInfo &flatten_input =
-      TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_flatten_shape(input)));
+    TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+      compute_flatten_shape(input)));
   const ITensorInfo &reshaped_weights =
-      TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
-          compute_transposed_shape(*weights)));
+    TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(
+      compute_transposed_shape(*weights)));
   const ITensorInfo &converted_weights =
-      weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
-                       : TensorInfo(*reshaped_weights.clone());
+    weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                     : TensorInfo(*reshaped_weights.clone());
   const ITensorInfo &gemmlowp_output = TensorInfo(
-      output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+    output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
   // Configure accumulate biases kernel for non quantized asymmetric types
   if (biases != nullptr && !is_quantized)
@@ -330,7 +328,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
   {
     // Validate reshape weights kernel
     ARM_COMPUTE_RETURN_ON_ERROR(
-        NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+      NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
     weights_to_use = &reshaped_weights;
   }
@@ -338,7 +336,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
   {
     // Validate convert weights kernel
     ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(
-        weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
+      weights_to_use, &converted_weights, input->tensor_shape(), fc_info.weights_trained_layout));
     weights_to_use = &converted_weights;
   }
@@ -346,11 +344,11 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
   {
     // Fully Connected layer after a Convolution Layer without batches
     ARM_COMPUTE_RETURN_ERROR_ON(
-        (weights_to_use->dimension(1) !=
-         (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+      (weights_to_use->dimension(1) !=
+       (input->dimension(0) * input->dimension(1) * input->dimension(2))));
     // Validate flatten kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input));
     input_to_use = &flatten_input;
   }
   else
@@ -365,7 +363,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
   if (is_quantized)
   {
     ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(
-        &gemmlowp_output, biases, output));
+      &gemmlowp_output, biases, output));
   }
   return Status{};
@@ -376,9 +374,13 @@ void NEFullyConnectedLayerEx::run()
   if (!_is_prepared)
   {
     if (!_are_weights_reshaped)
+    {
       _reshape_weights_output.allocator()->allocate();
+    }
     if (!_are_weights_converted)
+    {
       _converted_weights_output.allocator()->allocate();
+    }
     _is_prepared = true;
   }
@@ -409,7 +411,7 @@ void NEFullyConnectedLayerEx::run()
   // Linearize input if it comes from a convolutional layer
   if (_is_fc_after_conv)
   {
-    NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+    _flatten_kernel.run();
   }
   // Run matrix multiply
@@ -492,3 +494,4 @@ void NEFullyConnectedLayerEx::prepare()
   }
 #endif
 }
+} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
index dc6c78478..2199839fb 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -19,6 +19,8 @@
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+#include "src/core/helpers/AutoConfiguration.h"
+#include <cassert>
 using namespace arm_compute;
@@ -56,7 +58,7 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input
   assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
   bool is_hybrid = input->info()->data_type() == DataType::F32 &&
-                   (weights->info()->data_type() == DataType::S8 ||
+                   (weights->info()->data_type() == DataType::QSYMM8 ||
                     weights->info()->data_type() == DataType::QASYMM8_SIGNED);
   if (is_hybrid)
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
index 433c35d58..e5607ab9a 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -41,7 +41,6 @@
 #include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
 #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "support/MemorySupport.h"
 #include <utility>
@@ -49,7 +48,7 @@ namespace arm_compute
 {
 void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
 {
-  auto k = support::cpp14::make_unique<NEGatherKernelEx>();
+  auto k = std::make_unique<NEGatherKernelEx>();
   k->configure(input, indices, output, axis);
   _kernel = std::move(k);
 }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
index 52d58accf..7cc6c89e7 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -41,14 +41,13 @@
 #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
 #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-#include "support/MemorySupport.h"
 using namespace arm_compute;
 void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
                                   ITensor *output, ITensor *hits)
 {
-  auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
+  auto k = std::make_unique<NEHashtableLookupKernel>();
   k->configure(lookups, keys, input, output, hits);
   _kernel = std::move(k);
 }
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
index 16d74e62d..451aa0997 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEInstanceNormalizationLayerEx.cpp
@@ -46,9 +46,9 @@ namespace arm_compute
 {
 NEInstanceNormalizationLayerEx::NEInstanceNormalizationLayerEx(
-    std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
-      _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+  std::shared_ptr<IMemoryManager> memory_manager)
+  : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false),
+    _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
 {
 }
@@ -88,8 +88,8 @@ Status NEInstanceNormalizationLayerEx::validate(const ITensorInfo *input, const
                                                 float epsilon)
 {
   return NEInstanceNormalizationLayerKernelEx::validate(
-      &input->clone()->set_data_layout(DataLayout::NCHW),
-      &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+    &input->clone()->set_data_layout(DataLayout::NCHW),
+    &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
 }
 void NEInstanceNormalizationLayerEx::run()
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
index 2752eb6aa..e0620bad2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
@@ -15,7 +15,7 @@
  */
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
 * SPDX-License-Identifier: MIT
 *
@@ -37,30 +37,23 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
+#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
+#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayerEx.h"
-
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
-#include "arm_compute/runtime/IRuntimeContext.h"
-#include "support/MemorySupport.h"
-
+#include <utility>
 namespace arm_compute
 {
-NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT
-  : INESimpleFunctionNoBorder(ctx)
+void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
+                         const ITensor *off_value, ITensor *output, int axis)
 {
-}
-void NEActivationLayerEx::configure(ITensor *input, ITensor *output,
-                                    ActivationLayerInfo activation_info)
-{
-  auto k = support::cpp14::make_unique<NEActivationLayerKernelEx>();
-  k->configure(input, output, activation_info);
+  auto k = std::make_unique<NEOneHotKernel>();
+  k->configure(indices, depth, on_value, off_value, output, axis);
   _kernel = std::move(k);
 }
-
-Status NEActivationLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                     const ActivationLayerInfo &act_info)
+Status NEOneHot::validate(const ITensorInfo *indices, const ITensorInfo *depth,
+                          const ITensorInfo *on_value, const ITensorInfo *off_value,
+                          const ITensorInfo *output, int axis)
 {
-  return NEActivationLayerKernelEx::validate(input, output, act_info);
+  return NEOneHotKernel::validate(indices, depth, on_value, off_value, output, axis);
 }
 } // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
index aedb537e9..a30c00ea1 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -40,22 +40,24 @@
 #include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
 using namespace arm_compute;
 NEReduceOperation::NEReduceOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
-      _reduction_ops(), _keep_dims()
+  : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+    _reduction_ops(), _keep_dims()
 {
 }
 Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
-                                   bool keep_dims, const ITensorInfo *output, ReduceOperation op)
+                                   bool keep_dims, const ITensorInfo *output, ReductionOperation op)
 {
   ARM_COMPUTE_UNUSED(keep_dims);
   ARM_COMPUTE_UNUSED(op);
@@ -102,7 +104,7 @@ Status NEReduceOperation::validate(const ITensorInfo *input, const Coordinates &
 }
 void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
-                                  ITensor *output, ReduceOperation op)
+                                  ITensor *output, ReductionOperation op)
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -125,7 +127,7 @@ void NEReduceOperation::configure(ITensor *input, const Coordinates &reduction_a
   for (unsigned int i = 0; i < _reduction_ops; ++i)
   {
     TensorShape out_shape =
-        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+      i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
     out_shape.set(axis_local[i], 1);
     auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
index 26a887912..7a1342644 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -40,15 +40,19 @@
 #include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
 using namespace arm_compute;
 NEReduceSum::NEReduceSum(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
-      _reduction_ops(), _keep_dims()
+  : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
+    _reduction_ops(), _keep_dims()
 {
 }
@@ -122,7 +126,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b
   for (unsigned int i = 0; i < _reduction_ops; ++i)
   {
     TensorShape out_shape =
-        i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+      i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
     out_shape.set(axis_local[i], 1);
     auto in = (i == 0) ? input : (&_reduced_outs[i - 1]);
@@ -135,7 +139,7 @@ void NEReduceSum::configure(ITensor *input, const Coordinates &reduction_axis, b
     _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
                                                   input->info()->data_type(),
                                                   input->info()->quantization_info())
-                                           .set_data_layout(input->info()->data_layout()));
+                                         .set_data_layout(input->info()->data_layout()));
     _memory_group.manage(&_reduced_outs[i]);
     _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::SUM);
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
deleted file mode 100644
index 2aa0d2d4b..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReductionOperationEx.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEReductionOperationEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-namespace
-{
-/** Define dimension to split the window
- *
- * @param[in] axis Reduction axis
- *
- * @return The dimension to split the window
- */
-size_t reduction_window_split_dimension(unsigned int axis)
-{
-  switch (axis)
-  {
-    case 0:
-      return Window::DimY;
-    case 1:
-    case 2:
-    case 3:
-      return Window::DimX;
-    default:
-      ARM_COMPUTE_ERROR("Unsupported reduction axis");
-  }
-}
-} // namespace
-
-NEReductionOperationEx::NEReductionOperationEx()
-    : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
-{
-}
-
-Status NEReductionOperationEx::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                        unsigned int axis, ReduceOperation op)
-{
-  ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernelEx::validate(input, output, axis, op));
-
-  return Status{};
-}
-
-void NEReductionOperationEx::configure(ITensor *input, ITensor *output, unsigned int axis,
-                                       ReduceOperation op)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(
-      NEReductionOperationEx::validate(input->info(), output->info(), axis, op));
-
-  // Configure reduction kernel
-  _reduction_kernel.configure(input, output, axis, op);
-  _window_split = reduction_window_split_dimension(axis);
-  _reduction_axis = axis;
-
-  if (axis == 0)
-  {
-    // Configure fill border kernel
-    const BorderSize fill_border_size = _reduction_kernel.border_size();
-    PixelValue pixelValue;
-    switch (op)
-    {
-      case ReduceOperation::MIN:
-      {
-        switch (input->info()->data_type())
-        {
-          case DataType::F32:
-          {
-            pixelValue = PixelValue(std::numeric_limits<float>::max());
-            break;
-          }
-          case DataType::F16:
-          {
-            pixelValue = PixelValue(static_cast<half>(65504.0f));
-            break;
-          }
-          case DataType::QASYMM8:
-          {
-            pixelValue =
-                PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
-            break;
-          }
-          default:
-          {
-            ARM_COMPUTE_ERROR("Unsupported DataType");
-          }
-        }
-        break;
-      }
-      case ReduceOperation::MAX:
-      {
-        switch (input->info()->data_type())
-        {
-          case DataType::F32:
-          {
-            pixelValue = PixelValue(-std::numeric_limits<float>::max());
-            break;
-          }
-          case DataType::F16:
-          {
-            pixelValue = PixelValue(static_cast<half>(-65504.0f));
-            break;
-          }
-          case DataType::QASYMM8:
-          {
-            pixelValue =
-                PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
-            break;
-          }
-          default:
-          {
-            ARM_COMPUTE_ERROR("Unsupported DataType");
-          }
-        }
-        break;
-      }
-      default:
-        ARM_COMPUTE_ERROR("Reduction Operation unsupported");
-    }
-    _fill_border_kernel.configure(input, fill_border_size, BorderMode::CONSTANT, pixelValue);
-  }
-}
-
-void NEReductionOperationEx::run()
-{
-  if (_reduction_axis == 0)
-  {
-    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
-  }
-  NEScheduler::get().schedule(&_reduction_kernel, _window_split);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
index aa165cc15..4675121b2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -44,6 +44,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
 using namespace arm_compute::misc::shape_calculator;
@@ -51,17 +52,9 @@ namespace arm_compute
 {
 NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
-    : _memory_group(std::move(memory_manager)),
-      _conv_f(),
-      _upsample_f(),
-      _flip_weights(),
-      _scaled_output(),
-      _weights_flipped(),
-      _flip_axis(),
-      _original_weights(nullptr),
-      _input(nullptr),
-      _info(),
-      _is_prepared(false)
+  : _memory_group(std::move(memory_manager)), _conv_f(), _upsample_f(), _flip_weights(),
+    _scaled_output(), _weights_flipped(), _flip_axis(), _original_weights(nullptr), _input(nullptr),
+    _info(), _is_prepared(false)
 {
 }
@@ -76,15 +69,15 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
   ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
   ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
   const unsigned int width_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+    get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
   const unsigned int height_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+    get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
   ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
   ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
   auto out_dims = transposeconv_output_dimensions(
-      input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
-      weights->dimension(height_idx), info, invalid_right, invalid_bottom);
+    input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx),
+    weights->dimension(height_idx), info, invalid_right, invalid_bottom);
   ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
   if (bias != nullptr)
@@ -117,24 +110,24 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
   unsigned int pad_right = 0;
   unsigned int pad_top = 0;
   unsigned int pad_bottom = 0;
-  const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
-      pad_bottom);
+  const TensorShape scale_out_shape =
+    compute_transposeconv_upsampled_shape(*input, *weights, info, out_dims, invalid_right,
+                                          invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
   TensorInfo scale_out_info(
-      input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+    input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
   const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
   const unsigned int batches_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+    get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
   const unsigned int channel_idx =
-      get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+    get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
   ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) !=
                               scale_out_info.dimension(batches_idx));
   ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) !=
                               scale_out_info.dimension(channel_idx));
-  ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output,
-                                                           conv_info, WeightsInfo()));
+  ARM_COMPUTE_RETURN_ON_ERROR(
+    NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
   return Status{};
 }
@@ -146,21 +139,21 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
   // Perform validation step
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
   ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
-      input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
-      info, invalid_right, invalid_bottom));
+    input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
+    info, invalid_right, invalid_bottom));
   const DataLayout data_layout = input->info()->data_layout();
   const unsigned int width_idx =
-      get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
   const unsigned int height_idx =
-      get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
   auto out_dims = transposeconv_output_dimensions(
-      input->info()->dimension(width_idx), input->info()->dimension(height_idx),
-      weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
-      invalid_right, invalid_bottom);
+    input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+    weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info,
+    invalid_right, invalid_bottom);
   const TensorShape output_shape =
-      compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+    compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
   _input = input;
   _original_weights = weights;
@@ -188,8 +181,8 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
   const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
   const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
-      *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
-      pad_right, pad_top, pad_bottom);
+    *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+    pad_right, pad_top, pad_bottom);
   const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
                                     DimensionRoundingType::FLOOR);