| author | Chunseok Lee <chunseok.lee@samsung.com> | 2021-04-20 18:01:41 +0900 |
|---|---|---|
| committer | Chunseok Lee <chunseok.lee@samsung.com> | 2021-04-20 18:01:41 +0900 |
| commit | 589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e (patch) | |
| tree | 47a2b23ce4220e3a4150c8b12ed941555272fb0c /compute | |
| parent | 62529acabbafce7730601ed01d5709d7bc0d378a (diff) | |
Imported Upstream version 1.15.0 (tags: upstream/1.15.0, submit/tizen/20210427.093759, submit/tizen/20210423.055448, submit/tizen/20210422.015846, submit/tizen/20210421.062230, accepted/tizen/unified/20210428.040443)
Diffstat (limited to 'compute')
146 files changed, 17438 insertions, 364 deletions
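Most of the changes below track the ARM Compute Library's internal header reorganization (kernel headers such as ICLKernel.h and INEKernel.h are now included from src/core/ rather than arm_compute/core/), close namespaces with explicit comments, and add local ARMComputeEx copies of kernels such as CLGEMMMatrixAccumulateBiasesKernel, CLMemsetKernel, CLPadLayerKernelEx/CLPadLayerEx and NEGEMMMatrixAccumulateBiasesKernel. As a reading aid, here is a minimal usage sketch of the accumulate-biases kernel's documented configure/validate/run interface; the tensor shapes, scheduler setup and the function name `accumulate_biases_example` are illustrative assumptions, not code from this commit.

```cpp
// Minimal sketch (assumption, not part of this commit): exercising the
// CLGEMMMatrixAccumulateBiasesKernel interface declared in the new header
// added below. Shapes and values are illustrative only.
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void accumulate_biases_example()
{
  CLScheduler::get().default_init(); // create the OpenCL context and queue

  // Accumulation buffer (e.g. a GEMM output) and a 1D bias vector of the
  // same data type, as required by the kernel's documentation.
  CLTensor accum, biases;
  accum.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
  biases.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
  accum.allocator()->allocate();
  biases.allocator()->allocate();

  // validate() checks the tensor infos before any OpenCL kernel is built.
  ARM_COMPUTE_ERROR_THROW_ON(CLGEMMMatrixAccumulateBiasesKernel::validate(
    accum.info(), biases.info(), CLScheduler::get().target()));

  CLGEMMMatrixAccumulateBiasesKernel kernel;
  kernel.configure(&accum, &biases);  // accum += biases, row by row
  CLScheduler::get().enqueue(kernel); // run on the default CL queue
}
```

The NEGEMMMatrixAccumulateBiasesKernel added further down exposes the same configure/validate pair, but its run() override takes a Window and a ThreadInfo and is scheduled on the CPU rather than a CL command queue.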
diff --git a/compute/.clang-format b/compute/.clang-format
deleted file mode 120000
index 0ff66f331..000000000
--- a/compute/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../.clang-format.8
\ No newline at end of file diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h index 4a3717885..d3e116381 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -264,5 +264,5 @@ private: _program_source_map; /**< Contains sources for all programs. Used for compile-time kernel inclusion. >*/ }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h index a0aa0560b..46d4ae858 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h @@ -40,7 +40,7 @@ #ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H #define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h index bb6fcb8f5..eac866b67 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h @@ -41,8 +41,8 @@ #ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ #define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h index ed668fd9c..cf671102e 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ #define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__ -#include "arm_compute/core/CL/ICLSimple3DKernel.h" +#include "src/core/CL/ICLSimple3DKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h index fb689f747..6729fb0f1 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ #define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h new file mode 100644 index 000000000..64908ab59 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H +#define ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H + +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +/** Interface to add a bias to each row of the input tensor + * + */ +class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLGEMMMatrixAccumulateBiasesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLGEMMMatrixAccumulateBiasesKernel & + operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types + * supported: Same as @p input + */ + void configure(ICLTensor *accum, const ICLTensor *biases); + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. 
Data + * types supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, ICLTensor *accum, + const ICLTensor *biases); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLGEMMMatrixAccumulateBiasesKernel + * + * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types + * supported: Same as @p input + * @param[in] gpu_target GPU target + * + * @return a status + */ + static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_accum; + const ICLTensor *_biases; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h index 6630c7be7..a55f2401d 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__ #define __ARM_COMPUTE_CLGATHEREXKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h index 96f830898..f9d6f7cc5 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ #define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h index f57e799ad..7da9e9a4c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ #define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h new file mode 100644 index 000000000..4befdd05c --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H +#define ARM_COMPUTE_CLMEMSETKERNEL_H + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for filling the planes of a tensor */ +class CLMemsetKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLMemsetKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMemsetKernel(const CLMemsetKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLMemsetKernel &operator=(const CLMemsetKernel &) = delete; + /** Allow instances of this class to be moved */ + CLMemsetKernel(CLMemsetKernel &&) = default; + /** Allow instances of this class to be moved */ + CLMemsetKernel &operator=(CLMemsetKernel &&) = default; + /** Default destructor */ + ~CLMemsetKernel() = default; + + /** Initialise the kernel's tensor and filling value + * + * @param[in,out] tensor Input tensor to fill. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default + * is nullptr. + */ + void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr); + /** Initialise the kernel's tensor and filling value + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Input tensor to fill. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default + * is nullptr. 
+ */ + void configure(const CLCompileContext &compile_context, ICLTensor *tensor, + const PixelValue &constant_value, Window *window = nullptr); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLMemsetKernel + * + * @param[in] tensor Source tensor info. Data types supported: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default is + * nullptr. + * + * @return a status + */ + static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, + Window *window = nullptr); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_tensor; + Window _full_window; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLMEMSETRKERNEL_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h index 90e8b5705..5394a062c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ #define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h index fa383c0d0..384050aff 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLNEGKERNEL_H__ #define __ARM_COMPUTE_CLNEGKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h index a512057b9..1d64f9f7d 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h @@ -39,7 +39,7 @@ */ #ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__ #define __ARM_COMPUTE_CLONEHOTKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h new file mode 100644 index 000000000..d4230aaf3 --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLPADLAYERKERNELEX_H +#define ARM_COMPUTE_CLPADLAYERKERNELEX_H + +#include "src/core/CL/ICLKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the PadLayer function. */ +class CLPadLayerKernelEx : public ICLKernel +{ +public: + /** Default constructor */ + CLPadLayerKernelEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernelEx(const CLPadLayerKernelEx &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerKernelEx &operator=(const CLPadLayerKernelEx &) = delete; + /** Allow instances of this class to be moved */ + CLPadLayerKernelEx(CLPadLayerKernelEx &&) = default; + /** Allow instances of this class to be moved */ + CLPadLayerKernelEx &operator=(CLPadLayerKernelEx &&) = default; + /** Default destructor */ + ~CLPadLayerKernelEx() = default; + /** Set the input and output tensor. + * + * @param[in] input Source tensor. Data types supported: U8, S8, QASYMM8, + * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Set the input and output tensor. + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The + * pair padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. 
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPadLayerKernelEx + * + * @param[in] input Source tensor info. Data types supported: U8, S8, QASYMM8, + * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32. + * @param[in] output Output tensor info. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + int _input_start_x; + int _input_start_y; + bool _4d_enabled; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLPADLAYERKERNELEX_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h index 4e1b56cba..3f60db7bb 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ #define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h index 4f9042e41..548f29a27 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ #define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h index 4d4478ece..5f5b7f9b8 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ #define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" namespace arm_compute 
{ diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h index aa4a14812..09073af7c 100644 --- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h +++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -47,7 +47,7 @@ #ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ #define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ -#include "arm_compute/core/CL/ICLKernel.h" +#include "src/core/CL/ICLKernel.h" // these parameters can be changed #define _ITEMS 16 // number of items in a group diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h index 8c544cda8..c46b26170 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h @@ -41,15 +41,19 @@ #ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ #define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ -#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/cpu/kernels/CpuElementwiseKernel.h" + namespace arm_compute { -class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel +class NEBinaryLogicalOperationKernel : public cpu::kernels::CpuComparisonKernel { public: + const char *name() const override { return "NEBinaryLogicalOperationKernel"; } + + NEBinaryLogicalOperationKernel() = default; /** Default destructor */ ~NEBinaryLogicalOperationKernel() = default; @@ -81,6 +85,10 @@ protected: // Inherited methods overridden: static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output); + + std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, + const Window &window)> + _function; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h index 101f6ac8e..036d56e69 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h @@ -40,7 +40,7 @@ #ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__ #define __ARM_COMPUTE_NECASTBOOLKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h index 88f21c96e..621500eb8 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ #define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h new file mode 100644 index 000000000..f8f7ac567 --- /dev/null +++ 
b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H +#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H + +#include "src/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; +/** NEON kernel to add a bias to each row of the input tensor */ +class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel +{ +public: + const char *name() const override { return "NEGEMMMatrixAccumulateBiasesKernel"; } + /** Default constructor */ + NEGEMMMatrixAccumulateBiasesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixAccumulateBiasesKernel & + operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Default destructor */ + ~NEGEMMMatrixAccumulateBiasesKernel() = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32 + * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. 
Data type + * supported: Same as @p input + */ + void configure(ITensor *accum, const ITensor *biases); + /** Static function to check if given info will lead to a valid configuration of @ref + * NEGEMMMatrixAccumulateBiasesKernel + * + * @param[in] accum The accumulate tensor to convert. Data type supported: F32 + * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type + * supported: Same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *accum, const ITensorInfo *biases); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + ITensor *_accum; + const ITensor *_biases; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */ diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h index 5acfde5a8..a03e08ade 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__ #define __ARM_COMPUTE_NEGATHERKERNELEX_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h index cb2a485d5..fb3a72725 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ #define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h index 8724cc69b..1d786b59e 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ #define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h index 198b0be9d..ab534fe96 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ #define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h index 963d7b821..c1c9f7a3c 100644 --- 
a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h @@ -39,7 +39,7 @@ */ #ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__ #define __ARM_COMPUTE_NEONEHOTKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h index 0b080cf73..1fd5362ae 100644 --- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h +++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h @@ -41,7 +41,7 @@ #ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ #define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__ -#include "arm_compute/core/NEON/INEKernel.h" +#include "src/core/NEON/INEKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h index d57e8fcf5..d7ec1b4f0 100644 --- a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h +++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h @@ -67,5 +67,5 @@ transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_top); -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_UTILSEX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h index 484ebfd0b..664b8b3b1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h @@ -26,6 +26,7 @@ #include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h> #include <arm_compute/runtime/CL/functions/CLNeg.h> #include <arm_compute/runtime/CL/functions/CLOneHot.h> +#include <arm_compute/runtime/CL/functions/CLPadLayerEx.h> #include <arm_compute/runtime/CL/functions/CLReduceOperation.h> #include <arm_compute/runtime/CL/functions/CLSplitVEx.h> #include <arm_compute/runtime/CL/functions/CLTopKV2.h> diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h index b1ee52bf9..05bcc4075 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h @@ -41,8 +41,9 @@ #define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__ #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" -#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h" + #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" @@ -100,7 +101,7 @@ private: std::vector<CLTensor> _results_vector; CLTensor _not_reshaped_output; std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector; - CLReshapeLayerKernel _reshape_kernel; + CLReshapeLayer _reshape_kernel; unsigned int _num_of_stages; unsigned int _reduction_axis; }; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h 
b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h index 88a9b00ec..fc4322798 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h @@ -43,6 +43,7 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/core/TypesEx.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h index d6150684a..854ddce52 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h @@ -67,5 +67,5 @@ public: */ void configure(ICLTensor *input, ICLTensor *output); }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CLCASTBOOL_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h index fbee7e40e..b0149cb09 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h @@ -73,5 +73,5 @@ public: */ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h index f3266f688..c75ae9a50 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -43,14 +43,14 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" #include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" #include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" -#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" namespace arm_compute { @@ -182,5 +182,5 @@ private: bool _is_prepared; const ICLTensor *_original_weights; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index f27e9913e..c08da526a 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -43,16 +43,14 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/CL/CLTensor.h" #include 
"arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "src/core/CL/kernels/CLTransposeKernel.h" namespace arm_compute { @@ -132,9 +130,6 @@ private: * transpose_weights is set to true ) (called once) * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized * asymmetric) - * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref - * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is - * not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. */ @@ -157,40 +152,36 @@ public: * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor. The weights must be 2 dimensional. * If this function is called after a Convolution Layer, the (transposed) - * weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) - * weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. + * weights will have as many rows as the product of the first 3 input's dimensions. If it is + * called after another FullyConnected Layer, the (transposed) weights will have as many rows as + * the input's first dimension. Data type supported: Same as @p input. * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input. * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix * multiplication between: * - The output of im2col on the input and the (transposed) 2D weights, if the * function is called after a Convolution Layer * - The input tensor and the (transposed) 2D weights, if the function is - * called after another FullyConnected Layer. - * Data type supported: Same as @p input. + * called after another FullyConnected Layer. Data type supported: Same as @p input. * @param[in] fc_info (Optional) Fully connected layer additional info */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref - * CLFullyConnectedLayerEx + * CLFullyConnectedLayer * * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32. * @param[in] weights Weights tensor info. The weights must be 2 dimensional. * If this function is called after a Convolution Layer, the (transposed) - * weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) - * weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. + * weights will have as many rows as the product of the first 3 input's dimensions. If it is + * called after another FullyConnected Layer, the (transposed) weights will have as many rows as + * the input's first dimension. Data type supported: Same as @p input. * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input. 
* @param[out] output Destination tensor info. Its shape should be equal to the output of a * matrix multiplication between: * - The output of im2col on the input and the (transposed) 2D weights, if the * function is called after a Convolution Layer * - The input tensor and the (transposed) 2D weights, if the function is - * called after another FullyConnected Layer. - * Data type supported: Same as @p input. + * called after another FullyConnected Layer. Data type supported: Same as @p input. * @param[in] fc_info (Optional) Fully connected layer additional info * * @return a status diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h index 167554c9e..385eb0b2c 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h @@ -47,11 +47,14 @@ #ifndef __ARM_COMPUTE_CLGATHEREX_H__ #define __ARM_COMPUTE_CLGATHEREX_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** * @brief Class to to run @ref CLGatherKernel. @@ -81,5 +84,5 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLGATHEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h index 6618f5aa4..5e172a4c7 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h @@ -78,5 +78,5 @@ public: void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput, ICLTensor *output, ICLTensor *hits); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h index 887e7aaa5..02ae6d719 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h @@ -41,11 +41,14 @@ #ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ #define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" namespace arm_compute { +class CLCompileContext; class ICLTensor; +class ITensorInfo; /** Basic function to perform a Instance normalization. 
* diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h index 2bbfca821..62a36f06d 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h @@ -39,9 +39,11 @@ */ #ifndef __ARM_COMPUTE_CLONEHOT_H__ #define __ARM_COMPUTE_CLONEHOT_H__ -#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" + #include "arm_compute/core/CL/kernels/CLOneHotKernel.h" +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include "arm_compute/runtime/IFunction.h" + namespace arm_compute { class ICLTensor; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h new file mode 100644 index 000000000..ee1879aaa --- /dev/null +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLPADLAYEREX_H +#define ARM_COMPUTE_CLPADLAYEREX_H + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" +#include "src/core/gpu/cl/kernels/ClCopyKernel.h" +// #include "arm_compute/runtime/CL/functions/CLCopy.h" +#include <memory> + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to pad a tensor. 
This function calls the following OpenCL functions/kernels: + * + * -# @ref CLPadLayerKernelEx if there is padding to be added + * -# @ref CLCopyKernel otherwise + */ +class CLPadLayerEx : public IFunction +{ +public: + /** Default constructor */ + CLPadLayerEx(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerEx(const CLPadLayerEx &) = delete; + /** Default move constructor */ + CLPadLayerEx(CLPadLayerEx &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLPadLayerEx &operator=(const CLPadLayerEx &) = delete; + /** Default move assignment operator */ + CLPadLayerEx &operator=(CLPadLayerEx &&) = default; + + /** Initialize the function + * + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: All. + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The + * pair padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding. + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). + */ + void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPadLayerEx. + * + * @param[in] input Source tensor info. Data types supported: All. + * @param[in] output Output tensor info. Data type supported: same as @p input + * @param[in] padding The padding for each spatial dimension of the input tensor. The pair + * padding[i] specifies the front and the end padding in the i-th dimension. + * @param[in] constant_value (Optional) Constant value to be used for the padding + * @param[in] mode (Optional) Controls whether the padding should be filled with @p + * constant_value using CONSTANT, or reflect the input, either including the border values + * (SYMMETRIC) or not (REFLECT). 
+ */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value = PixelValue(), + PaddingMode mode = PaddingMode::CONSTANT); + + // Inherited methods overridden: + void run() override; + +private: + void configure_reflect_mode(ICLTensor *input, ICLTensor *output); + + std::unique_ptr<CLPadLayerKernelEx> _pad_kernel; + std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel; + bool _perform_pad; +}; +} // namespace arm_compute +#endif /*ARM_COMPUTE_CLPADLAYEREX_H */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h index bb852e404..45eb72bef 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h @@ -116,5 +116,5 @@ private: std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr}; CLReshapeLayer _reshape; }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h index bb741d98d..3023df3f0 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h @@ -46,6 +46,9 @@ #include <vector> #include <memory> +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/runtime/CPP/functions/CPPSplit.h" + namespace arm_compute { class ICLTensor; @@ -82,5 +85,5 @@ private: unsigned int _num_splits; std::vector<CLSlice> _slice_functions; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLSPLITVEX__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h index e301a5152..f426a4d75 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -160,5 +160,5 @@ private: CLTopKV2Store _store_kernel; #endif }; -} +} // namespace arm_compute #endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h index efc296d6c..d0ddc2609 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h @@ -16,7 +16,6 @@ #ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__ #define __ARM_COMPUTE_NEFUNCTIONSEX_H__ -#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h> #include <arm_compute/runtime/NEON/functions/NECastBool.h> #include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h> diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h index 026d30098..8d931f08d 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h @@ -41,8 +41,10 @@ #ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ #define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__ +#include "arm_compute/core/Error.h" #include "arm_compute/core/TypesEx.h" #include 
"arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/core/ITensorInfo.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h index c8b08af8d..dd62645ee 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h @@ -41,16 +41,17 @@ #define __ARM_COMPUTE_NECASTBOOL_H__ #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { class ITensor; +class ITensorInfo; /** - * @brief Class to run @ref NECastBoolKernel. + * @brief Class to run @ref INESimpleFunctionNoBorder. */ -class NECastBool : public INESimpleFunction +class NECastBool : public INESimpleFunctionNoBorder { public: /** Initialize the function's source, destination diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h index 63f7714aa..82a789e86 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h @@ -48,12 +48,14 @@ #define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Error.h" #include <vector> namespace arm_compute { class ITensor; +class ITensorInfo; /** * @brief Class to perform EmbeddingLookup operation @@ -84,5 +86,5 @@ public: static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *lookups); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h index 56548a479..214592710 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -44,11 +44,11 @@ #include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" #include "arm_compute/runtime/Tensor.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h index 8f98f220a..2bbb1fea1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -43,16 +43,16 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" -#include 
"arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "src/core/NEON/kernels/NETransposeKernel.h" namespace arm_compute { @@ -79,11 +79,11 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete; /** Default move constructor */ - NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default; + NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete; /** Default move assignment operator */ - NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default; + NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = delete; /** Set the input and output tensors. * * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. @@ -141,7 +141,7 @@ private: void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); MemoryGroup _memory_group; - NEFlattenLayerKernel _flatten_kernel; + NEFlattenLayer _flatten_kernel; NEConvertFullyConnectedWeights _convert_weights; NEFullyConnectedLayerReshapeWeights _reshape_weights_function; NEGEMM _mm_gemm; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h index 155a1b837..6944c77f6 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h @@ -47,6 +47,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to run @ref NEGatherKernelEx */ class NEGatherEx : public INESimpleFunctionNoBorder diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h index 521a05ad9..f6fda60a9 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h @@ -48,12 +48,14 @@ #define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__ #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/core/Error.h" #include <vector> namespace arm_compute { class ITensor; +class ITensorInfo; /** * @brief Class to perform HashtableLookup operation @@ -96,5 +98,5 @@ public: const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *hits); }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h index 18e813923..0ee967698 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h +++ 
b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h @@ -54,6 +54,7 @@ namespace arm_compute { class ITensor; +class ITensorInfo; /** Basic function to perform a Instance normalization. * @@ -112,5 +113,5 @@ private: Tensor _permuted_input; Tensor _permuted_output; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h index 1a68f801a..668f024a1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h @@ -45,6 +45,8 @@ namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; + /** Basic function to run @ref NEOneHotKernel */ class NEOneHot : public INESimpleFunctionNoBorder { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h index 91eec815c..9858e6c09 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h @@ -43,7 +43,7 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h index 48b416923..f34a8f8af 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h @@ -43,11 +43,13 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" +#include "arm_compute/runtime/Tensor.h" namespace arm_compute { diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h index 7a08dae97..f82579a45 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h @@ -102,9 +102,9 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete; /** Allow instances of this class to be moved */ - NETransposeConvLayer(NETransposeConvLayer &&) = default; + NETransposeConvLayer(NETransposeConvLayer &&) = delete; /** Allow instances of this class to be moved */ - NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default; + NETransposeConvLayer &operator=(NETransposeConvLayer &&) = delete; /** Default destructor */ virtual ~NETransposeConvLayer() = default; @@ -171,5 +171,5 @@ private: PadStrideInfo _info; 
bool _is_prepared; }; -} // arm_compute +} // namespace arm_compute #endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */ diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 1a8ff3e71..1a180a35b 100644 --- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -66,12 +66,16 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map {"gather_ex_1d", "gather_ex.cl"}, {"gather_ex_1d_out", "gather_ex.cl"}, {"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"}, + {"gemm_accumulate_biases", "gemm.cl"}, {"hashtable_lookup", "hashtable_lookup.cl"}, {"instance_normalization_ex", "instance_normalization_ex.cl"}, + {"memset", "memset.cl"}, {"multiply_scale_factor", "multiply_scale_factor.cl"}, {"neg_tensor", "neg_tensor.cl"}, {"one_hot", "one_hot.cl"}, {"one_hot_only_on_value", "one_hot.cl"}, + {"pad_layer_constant", "pad_layer.cl"}, + {"pad_layer_symmetric_reflect", "pad_layer.cl"}, {"quantization_symm8", "quantization_symm8.cl"}, {"reduce_min_max", "reduce_operation.cl"}, {"reduce_sum_mean", "reduce_operation.cl"}, @@ -90,10 +94,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { #ifdef EMBEDDED_KERNELS { + "activation_float_helpers.h", +#include "./cl_kernels/activation_float_helpers.hembed" + }, + { "arg_min_max_ex.cl", #include "./cl_kernels/arg_min_max_ex.clembed" }, { + "binary_logical_op.cl", +#include "./cl_kernels/binary_logical_op.clembed" + }, + { "cast.cl", #include "./cl_kernels/cast.clembed" }, @@ -110,6 +122,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/gemmlowp_ex.clembed" }, { + "gemm_helpers.h", +#include "./cl_kernels/gemm_helpers.hembed" + }, + { "hashtable_lookup.cl", #include "./cl_kernels/hashtable_lookup.clembed" }, @@ -126,8 +142,12 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/instance_normalization_ex.clembed" }, { - "binary_logical_op.cl", -#include "./cl_kernels/binary_logical_op.clembed" + "gemm.cl", +#include "./cl_kernels/gemm.clembed" + }, + { + "memset.cl", +#include "./cl_kernels/memset.clembed" }, { "multiply_scale_factor.cl", @@ -142,6 +162,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/one_hot.clembed" }, { + "pad_layer.cl", +#include "./cl_kernels/pad_layer.clembed" + }, + { "quantization_symm8.cl", #include "./cl_kernels/quantization_symm8.clembed" }, @@ -150,6 +174,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map #include "./cl_kernels/reduce_operation.clembed" }, { + "repeat.h", +#include "./cl_kernels/repeat.hembed" + }, + { "scale_factor.cl", #include "./cl_kernels/scale_factor.clembed" }, diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h new file mode 100644 index 000000000..3c3ff8419 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +#if GPU_ARCH == GPU_ARCH_BIFROST +#define MLA(a, b, c) (fma(c, b, a)) +#else // GPU_ARCH == GPU_ARCH_BIFROST +#define MLA(a, b, c) ((b) * (c) + (a)) +#endif // GPU_ARCH == GPU_ARCH_BIFROST + +// Hard-Swish +#define hard_swish_op(DATA_TYPE, x, A_VAL, B_VAL) \ + (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) + +// Logistic Activation +#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) + +// Hyperbolic Tangent Activation +#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) + +// RELU Tangent Activation +#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) + +// Bounded RELU Activation +#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) + +// Lower Upper Bounded RELU Activation +#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) + +// Leaky RELU Activation +#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) \ + ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) + +// Soft RELU Activation +#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) + +// ELU Activation +#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) \ + (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0))) + +// Absolute Activation +#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x)) + +// Square Activation +#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x) + +// Square-root Activation +#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x)) + +// Linear Activation +#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) + +// Identity Activation +#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x) + +#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL) 
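// Illustrative expansion of the ACT_OP/ACTIVATION dispatch (ACT_OP above, ACTIVATION below),
// assuming the kernel is built with DATA_TYPE=float and the 'relu' activation:
//
//   ACTIVATION(relu, float, x, A_VAL, B_VAL)
//     -> ACT_OP(relu, float, x, A_VAL, B_VAL)   // extra level so 'op' is macro-expanded first
//     -> relu_op(float, x, A_VAL, B_VAL)        // token pasting: op##_op
//     -> (max((float)0.0, x))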
+ +#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl new file mode 100644 index 000000000..9b826a2bd --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl @@ -0,0 +1,7210 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "gemm_helpers.h" +#include "repeat.h" + +#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) +#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1) +#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2) +#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3) +#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7) +#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +#define CONCAT_INC(K0) INC##K0 +#define INC(K0) CONCAT_INC(K0) + +#if (SRC_WIDTH % K0) +#define BOUNDARY_CONDITION_X(x, a) \ + ({ \ + a = select( \ + 0, a, \ + CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), \ + VEC_DATA_TYPE(DATA_TYPE, K0))); \ + }) +#else // (SRC_WIDTH % K0) +#define BOUNDARY_CONDITION_X(x, a) ({}) +#endif // (SRC_WIDTH % K0) + +/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks + * of size M0xK0 and stores each one (not transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. + * -DSRC_WIDTH=16) + * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. + * -DM0=2, -DK0=2). 
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at + * compile time using -DV0 (e.g. -DV0=2) + * @note Only the following values for M0, K0 and V0 are supported: + * M0: 2,3,4,5,6,7,8 + * K0: 2,3,4,8,16 + * V0: greater than 0 + * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer + * 1x1), the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * + * @param[in] src_ptr Pointer to the source LHS tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_INPUT_AS_3D) + */ +__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +) +{ + // Block size +#define BLOCK_SIZE ((M0) * (K0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (K0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (K0) * (V0) +#else // Do not interleave +#define OUTPUT_STEP_X (K0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x 
* (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + + ((y / (uint)V0) * (uint)dst_stride_y) + + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); + + // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src_stride_z by DEPTH_GEMM3D + + input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); + +#else // defined(REINTERPRET_INPUT_AS_3D) + + input_ptr += z * (uint)src_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + output_ptr += z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + // Load values from the LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); + BOUNDARY_CONDITION_X(x, a0); +#if M0 > 1 + BOUNDARY_CONDITION_X(x, a1); +#endif // M0 > 1 +#if M0 > 2 + BOUNDARY_CONDITION_X(x, a2); +#endif // M0 > 2 +#if M0 > 3 + BOUNDARY_CONDITION_X(x, a3); +#endif // M0 > 3 +#if M0 > 4 + BOUNDARY_CONDITION_X(x, a4); +#endif // M0 > 4 +#if M0 > 5 + BOUNDARY_CONDITION_X(x, a5); +#endif // M0 > 5 +#if M0 > 6 + BOUNDARY_CONDITION_X(x, a6); +#endif // M0 > 6 +#if M0 > 7 + BOUNDARY_CONDITION_X(x, a7); +#endif // M0 > 7 + // ---------------------------Store output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} + +#if M0 == 2 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 3 // M0 == 3 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 4 // M0 == 4 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#elif M0 == 5 // M0 == 5 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + DATA_TYPE res1 = a4.s##i; \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \ + }) +#elif M0 == 6 // M0 == 6 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = 
(VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + VSTORE(2) \ + (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ + }) +#elif M0 == 7 // M0 == 7 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ + VEC_DATA_TYPE(DATA_TYPE, 3) \ + res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \ + VSTORE(4) \ + (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + VSTORE(3) \ + (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ + }) +#elif M0 == 8 // M0 == 8 +#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, M0) \ + res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, \ + a6.s##i, a7.s##i); \ + VSTORE(M0) \ + (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ + }) +#else // M0 not supported +#error "M0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks + * of size M0xK0 and stores each one (transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. + * -DSRC_WIDTH=16) + * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. + * -DM0=2, -DK0=2). + * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at + * compile time using -DV0 (e.g. -DV0=2) + * @note Only the following values for M0, K0 and V0 are supported: + * M0: 2,3,4,5,6,7,8 + * K0: 2,3,4,8,16 + * V0: greater than 0 + * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer + * 1x1), the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * + * @param[in] src_ptr Pointer to the source LHS tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_INPUT_AS_3D) + */ +__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +) +{ + // Block size +#define BLOCK_SIZE ((M0) * (K0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (M0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (M0) * (V0) +#else // Do not interleave +#define OUTPUT_STEP_X (M0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + + ((y / (uint)V0) * (uint)dst_stride_y) + + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); + + // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply src_stride_z by DEPTH_GEMM3D + + input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); + +#else // defined(REINTERPRET_INPUT_AS_3D) + + input_ptr += z * (uint)src_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + output_ptr += z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + + // Load values from the LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); + BOUNDARY_CONDITION_X(x, a0); +#if M0 > 1 + BOUNDARY_CONDITION_X(x, a1); +#endif // M0 > 1 +#if M0 > 2 + BOUNDARY_CONDITION_X(x, a2); +#endif // M0 > 2 +#if M0 > 3 + BOUNDARY_CONDITION_X(x, a3); +#endif // M0 > 3 +#if M0 > 4 + BOUNDARY_CONDITION_X(x, a4); +#endif // M0 > 4 +#if M0 > 5 + BOUNDARY_CONDITION_X(x, a5); +#endif // M0 > 5 +#if M0 > 6 + BOUNDARY_CONDITION_X(x, a6); +#endif // M0 > 6 +#if M0 > 7 + BOUNDARY_CONDITION_X(x, a7); +#endif // M0 > 7 + // ---------------------------Transpose and store block ----------------------- + + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1); +#if K0 > 2 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2); +#endif // K0 > 2 +#if K0 > 3 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3); +#endif // K0 > 3 +#if K0 > 4 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7); +#endif // K0 > 4 +#if K0 > 8 + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E); + TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F); +#endif // K0 > 8 + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} +#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) + +#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) +/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks + * of size K0xN0 and stores each one (not transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. + * -DSRC_HEIGHT=16) + * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. + * -DK0=2, -DN0=2). + * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at + * compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. 
+ * @note Only the following values for K0, N0 and H0 are supported: + * N0: 2,3,4,8,16 + * K0: 1,2,3,4,8,16 + * H0: greater than 0 + * + * @param[in] src_ptr Pointer to the source RHS tensor. Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Block size +#define BLOCK_SIZE ((K0) * (N0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (N0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (N0) * (H0) +#else // Do not interleave +#define OUTPUT_STEP_X (N0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + + z * (uint)src_stride_z; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + + ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, + 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0; + + // Load values from the RHS matrix + a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); +#if K0 > 1 + if (y * (uint)K0 + 1 < SRC_HEIGHT) + { + a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + } +#endif // K0 > 1 +#if K0 > 2 + if (y * (uint)K0 + 2 < SRC_HEIGHT) + { + a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * 
src_stride_y)); + } +#endif // K0 > 2 +#if K0 > 3 + if (y * (uint)K0 + 3 < SRC_HEIGHT) + { + a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); + } +#endif // K0 > 3 +#if K0 > 4 + if (y * (uint)K0 + 4 < SRC_HEIGHT) + { + a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); + } + if (y * (uint)K0 + 5 < SRC_HEIGHT) + { + a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); + } + if (y * (uint)K0 + 6 < SRC_HEIGHT) + { + a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); + } + if (y * (uint)K0 + 7 < SRC_HEIGHT) + { + a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); + } +#endif // K0 > 4 +#if K0 > 8 + if (y * (uint)K0 + 8 < SRC_HEIGHT) + { + a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); + } + if (y * (uint)K0 + 9 < SRC_HEIGHT) + { + a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); + } + if (y * (uint)K0 + 10 < SRC_HEIGHT) + { + aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); + } + if (y * (uint)K0 + 11 < SRC_HEIGHT) + { + aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); + } + if (y * (uint)K0 + 12 < SRC_HEIGHT) + { + aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); + } + if (y * (uint)K0 + 13 < SRC_HEIGHT) + { + aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); + } + if (y * (uint)K0 + 14 < SRC_HEIGHT) + { + aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); + } + if (y * (uint)K0 + 15 < SRC_HEIGHT) + { + aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); + } +#endif // K0 > 8 + + // ---------------------------Store output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} + +#if defined(TRANSPOSE) +/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks + * of size K0xN0 and stores each one (transposed) in the output matrix unrolling the values. + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. + * -DSRC_HEIGHT=16) + * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. + * -DK0=2, -DN0=2). + * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at + * compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile + * time. + * @note The option -DTRANSPOSE must passed at compile time. + * @note Only the following values for K0, N0 and H0 are supported: + * N0: 2,3,4,8,16 + * K0: 2,3,4,8,16 + * H0: greater than 0 + * + * @param[in] src_ptr Pointer to the source RHS tensor. 
Supported data + * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS + * tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Block size +#define BLOCK_SIZE ((K0) * (N0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (K0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (K0) * (H0) +#else // Do not interleave +#define OUTPUT_STEP_X (K0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + + z * (uint)src_stride_z; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + + ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... 
a(K0-1)=0; + + // Load values from the RHS matrix + a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + if (y * (uint)K0 + 1 < SRC_HEIGHT) + { + a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + } +#if K0 > 2 + if (y * (uint)K0 + 2 < SRC_HEIGHT) + { + a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); + } +#endif // K0 > 2 +#if K0 > 3 + if (y * (uint)K0 + 3 < SRC_HEIGHT) + { + a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); + } +#endif // K0 > 3 +#if K0 > 4 + if (y * (uint)K0 + 4 < SRC_HEIGHT) + { + a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); + } + if (y * (uint)K0 + 5 < SRC_HEIGHT) + { + a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); + } + if (y * (uint)K0 + 6 < SRC_HEIGHT) + { + a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); + } + if (y * (uint)K0 + 7 < SRC_HEIGHT) + { + a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); + } +#endif // K0 > 4 +#if K0 > 8 + if (y * (uint)K0 + 8 < SRC_HEIGHT) + { + a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); + } + if (y * (uint)K0 + 9 < SRC_HEIGHT) + { + a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); + } + if (y * (uint)K0 + 10 < SRC_HEIGHT) + { + aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); + } + if (y * (uint)K0 + 11 < SRC_HEIGHT) + { + aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); + } + if (y * (uint)K0 + 12 < SRC_HEIGHT) + { + aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); + } + if (y * (uint)K0 + 13 < SRC_HEIGHT) + { + aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); + } + if (y * (uint)K0 + 14 < SRC_HEIGHT) + { + aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); + } + if (y * (uint)K0 + 15 < SRC_HEIGHT) + { + aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); + } +#endif // K0 > 8 + + // ---------------------------Transpose the block ------------------------------ + REPEAT_VAR_INIT_TO_CONST( + N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, + 0); // VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... 
res(N0-1)=0; + +#if K0 == 2 + // This part computes the following transpositions: + // 2x2 -> 2x2 + // 2x4 -> 4x2 + // 2x8 -> 8x2 + // 2x16 -> 16x2 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF); +#endif // N0 > 8 + +#elif K0 == 3 // K0 == 2 + // This part computes the following transpositions: + // 3x2 -> 2x3 + // 3x4 -> 4x3 + // 3x8 -> 8x3 + // 3x16 -> 16x3 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF); +#endif // N0 > 8 + +#elif K0 == 4 // K0 == 4 + // This part computes the following transpositions: + // 4x2 -> 2x4 + // 4x4 -> 4x4 + // 4x8 -> 8x4 + // 4x16 -> 16x4 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC); 
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF); +#endif // N0 > 8 + +#elif K0 == 8 // K0 == 8 + // This part computes the following transpositions: + // 8x2 -> 2x8 + // 8x4 -> 4x8 + // 8x8 -> 8x8 + // 8x16 -> 16x8 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF); +#endif // N0 > 8 + +#elif K0 == 16 // K0 == 16 + + // This part computes the following transpositions: + // 16x2 -> 2x16 + // 16x4 -> 4x16 + // 16x8 -> 8x16 + // 16x16 -> 16x16 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0, + a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1, + a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2, + a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3, + a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4, + a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5, + a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6, + a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7, + a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 
= (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8, + a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9, + a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA, + a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB, + a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC, + a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD, + a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE, + a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF, + a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF); +#endif // N0 > 8 + +#else // N0 == 16 +#error "Not supported N0 value" +#endif // N0 > 2 + + // ---------------------------Store the output values ------------------------------ + REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); + STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); + +#undef BLOCK_SIZE +#undef OUTPUT_OFFSET_X +#undef OUTPUT_STEP_X +} +#endif // defined(TRANSPOSE) +#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) + +#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && \ + defined(M) && defined(N) && defined(K) + +#define CONCAT(a, b) a##b + +#define ARM_DOT1(a, b, c) ({ c = fma(a, b, c); }) +#define ARM_DOT2(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT2(a, b, c); \ + c = fma((a.s2), (b.s2), c); \ + }) +#define ARM_DOT4(a, b, c) \ + ({ \ + ARM_DOT3(a, b, c); \ + c = fma((a.s3), (b.s3), c); \ + }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) + +#if N0 == 2 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##4), (c.s4)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##5), (c.s5)); \ + CONCAT(ARM_DOT, k0) \ + ((a), 
(b##6), (c.s6)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##7), (c.s7)); \ + }) +#elif N0 == 16 // N0 == 16 +#define ARM_DOT_K0XN0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##0), (c.s0)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##1), (c.s1)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##2), (c.s2)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##3), (c.s3)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##4), (c.s4)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##5), (c.s5)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##6), (c.s6)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##7), (c.s7)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##8), (c.s8)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##9), (c.s9)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##A), (c.sA)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##B), (c.sB)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##C), (c.sC)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##D), (c.sD)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##E), (c.sE)); \ + CONCAT(ARM_DOT, k0) \ + ((a), (b##F), (c.sF)); \ + }) +#else // N0 not supported +#error "N0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. 
Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + 
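+  // Example (hypothetical) build configuration for this kernel, assuming an F32 GEMM with
+  // M=64, N=128, K=256 and an RHS reshaped with H0=4 horizontal blocks per row:
+  //   -DDATA_TYPE=float -DM=64 -DN=128 -DK=256 -DM0=4 -DN0=8 -DK0=4 -DH0=4
+  // With these values each work-item accumulates one M0 x N0 = 4 x 8 block of the destination,
+  // consuming an M0 x K0 tile of the LHS and one transposed K0 x N0 block of the reshaped RHS
+  // per iteration of the inner loop below.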
// Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(K0, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(K0, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(K0, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(K0, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(K0, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(K0, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(K0, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(K0, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(1, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(1, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(1, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(1, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(1, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(1, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(1, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(1, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#define VFMA(a, b, c) ({ c = fma(a, b, c); }) + +#if M0 == 1 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + }) +#elif M0 == 2 // M0 == 2 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + }) +#elif M0 == 3 // M0 == 3 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + }) +#elif M0 == 4 // M0 == 4 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + }) +#elif M0 == 5 // M0 == 5 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + 
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + }) +#elif M0 == 6 // M0 == 6 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + }) +#elif M0 == 7 // M0 == 7 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + }) +#elif M0 == 8 // M0 == 8 +#define LD_RHS_VFMA_M0xN0(i, a, c) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + b = VLOAD(N0)( \ + 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ + }) +#else // M0 not supported +#error "M0 not supported" +#endif // M0 not supported + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90). + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + 
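+  // Example (hypothetical values) of how the addressing macros above behave: with N0=4, K0=4
+  // and H0=2, RHS_BLOCK_SIZE is 16 elements. Without -DRHS_INTERLEAVE, each of the H0 = 2 blocks
+  // sharing an output row of the reshaped RHS starts a full block (RHS_OFFSET_X = 16 elements)
+  // apart and the inner loop advances by RHS_STEP_X = N0 = 4 elements per k step; with
+  // -DRHS_INTERLEAVE the blocks are interleaved, so they start only N0 = 4 elements apart and
+  // the step becomes N0 * H0 = 8 elements.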
REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); // uint zin0=0,zin1=0,zin2=0,... zin7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0,zero1=0,zero2=0,... zero7=0; + +#if defined(REINTERPRET_INPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); + + LD_RHS_VFMA_M0xN0(0, a, c); + LD_RHS_VFMA_M0xN0(1, a, c); +#if K0 > 2 + LD_RHS_VFMA_M0xN0(2, a, c); +#endif // K0 > 2 +#if K0 > 3 + LD_RHS_VFMA_M0xN0(3, a, c); +#endif // K0 > 3 +#if K0 > 4 + LD_RHS_VFMA_M0xN0(4, a, c); + LD_RHS_VFMA_M0xN0(5, a, c); + LD_RHS_VFMA_M0xN0(6, a, c); + LD_RHS_VFMA_M0xN0(7, a, c); +#endif // K0 > 4 +#if K0 > 8 + LD_RHS_VFMA_M0xN0(8, a, c); + LD_RHS_VFMA_M0xN0(9, a, c); + LD_RHS_VFMA_M0xN0(A, a, c); + LD_RHS_VFMA_M0xN0(B, a, c); + LD_RHS_VFMA_M0xN0(C, a, c); + LD_RHS_VFMA_M0xN0(D, a, c); + LD_RHS_VFMA_M0xN0(E, a, c); + LD_RHS_VFMA_M0xN0(F, a, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); +#endif // M0 > 4 +#if M0 > 5 + VEC_DATA_TYPE(DATA_TYPE, 2) + a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); +#endif // M0 > 5 +#if M0 > 6 + VEC_DATA_TYPE(DATA_TYPE, 2) + a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); +#endif // M0 > 6 +#if M0 > 7 + VEC_DATA_TYPE(DATA_TYPE, 2) + a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); +#endif // M0 > 7 + + LD_RHS_VFMA_M0xN0(0, a, c); + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * 
dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && + // defined(M) && defined(N) && defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && \ + defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) + +#if defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + c += a.s8 * b.s8; \ + c += a.s9 * b.s9; \ + c += a.sA * b.sA; \ + c += a.sB * b.sB; \ + c += a.sC * b.sC; \ + c += a.sD * b.sD; \ + c += a.sE * b.sE; \ + c += a.sF * b.sF; \ + }) +#else // K0 not supported +#error "K0 value not supported" 
+#endif // K0 conditions +#else // defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + c = fma(a.s8, b.s8, c); \ + c = fma(a.s9, b.s9, c); \ + c = fma(a.sA, b.sA, c); \ + c = fma(a.sB, b.sB, c); \ + c = fma(a.sC, b.sC, c); \ + c = fma(a.sD, b.sD, c); \ + c = fma(a.sE, b.sE, c); \ + c = fma(a.sF, b.sF, c); \ + }) +#else // K0 not supported +#error "K0 value not supported" +#endif // K0 conditions +#endif // defined(MIXED_PRECISION) + +#if N0 == 2 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + }) +#elif N0 == 16 // N0 == 16 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), (c.s4)); \ + ARM_DOT_K0((a), (b##5), (c.s5)); \ + ARM_DOT_K0((a), (b##6), (c.s6)); \ + ARM_DOT_K0((a), (b##7), (c.s7)); \ + ARM_DOT_K0((a), (b##8), (c.s8)); \ + ARM_DOT_K0((a), (b##9), (c.s9)); \ + ARM_DOT_K0((a), (b##A), (c.sA)); \ + ARM_DOT_K0((a), (b##B), (c.sB)); \ + ARM_DOT_K0((a), (b##C), (c.sC)); \ + ARM_DOT_K0((a), (b##D), (c.sD)); \ + ARM_DOT_K0((a), (b##E), (c.sE)); \ + ARM_DOT_K0((a), (b##F), (c.sF)); \ + }) +#else // N0 not supported +#error "N0 value not supported" +#endif // N0 conditions + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT + * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 + * must be transposed + * + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=float) + * @note The data type used for the accumulators must be passed at compile time using + * -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) + * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION + * passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option + * -DLHS_INTERLEAVE must passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS + * reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. 
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) + if ((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (get_global_id(0) / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += get_global_id(2) * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + + for (int i = 0; i < k; i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(a7, b, c7); +#endif // M0 > 7 + + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); + rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, + dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += get_global_id(2) * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // 
defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#if defined(LHS_TRANSPOSE) + +#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) + +#if defined(MIXED_PRECISION) + +#if (GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) \ + c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * \ + (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) \ + c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), \ + (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#else // defined(MIXED_PRECISION + +#if (GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) c += (a) * (b); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#endif // defined(MIXED_PRECISION) + +#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) ({ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); }) +#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ + }) +#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ + }) +#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ + }) +#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ + }) + +// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. 
+// K0 = 1 a is the column-vector (transposed) b is the row-vector (not transposed) C is the output +// matrix Lower case is a vector (a, b) Upper case is a matrix (C) +#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) + +#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ + ({ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); }) +#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ + }) +#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ + }) +#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ + }) +#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ + }) +#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ + }) + +// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. +// The dimensions for this matrix multiplications are defined through M0, N0 and K0 +// The dimensions supported are: +// M0: 1, 2, 3, 4, 8 +// N0: 1, 2, 3, 4, 8, 16 +// K0: 1, 2, 3, 4, 8, 16 +// This macro calls the vector-by-matrix macro K0 times +// A, B and C are matrices +#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ + CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ + (M0, N0, TYPE, A, B, C) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be + * transposed The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 + * must be NOT transposed + * + * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. + * -DLHS_TRANSPOSE). + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option + * -DLHS_INTERLEAVE must passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS + * reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS + * reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (M0) +#define LHS_STEP_X ((M0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (M0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#endif // defined(RHS_INTERLEAVE) + + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide 
matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + + __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); + __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); + + for (int i = 0; i < k; i += K0) + { + VEC_DATA_TYPE(DATA_TYPE, M0) + a0 = VLOAD(M0)(0, lhs); + VEC_DATA_TYPE(DATA_TYPE, N0) + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + +#if K0 > 1 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 1 + +#if K0 > 2 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 2 + +#if K0 > 3 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 3 + +#if K0 > 4 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 4 + +#if K0 > 8 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 8 + +#ifndef LHS_INTERLEAVE + lhs += (M0 * K0 * (V0 - 1)); +#endif // LHS_INTERLEAVE + +#ifndef RHS_INTERLEAVE + rhs += (N0 * K0 * (H0 - 1)); +#endif // RHS_INTERLEAVE + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * 
M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = + bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + + z * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#endif // defined(LHS_TRANSPOSE) + +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && + // defined(DATA_TYPE) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) + +#define VFMA(a, b, c) ({ c = fma(a, b, c); }) + +#if M0 == 1 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); }) +#elif M0 == 2 // M0 == 2 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + }) +#elif M0 == 3 // M0 == 3 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + }) +#elif M0 == 4 // M0 == 4 
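+/* Illustrative sketch: assuming DATA_TYPE=float and N0=4, a call such as
+ * RHS_VFMA_M0xN0(0, a, b0, c) in this M0 == 4 branch expands roughly to one fused
+ * multiply-add per accumulator row,
+ *
+ *   c0 = fma((float4)(a0.s0), b0, c0);
+ *   c1 = fma((float4)(a1.s0), b0, c1);
+ *   c2 = fma((float4)(a2.s0), b0, c2);
+ *   c3 = fma((float4)(a3.s0), b0, c3);
+ *
+ * i.e. element 0 of each LHS row vector is broadcast against the RHS row b0 and
+ * accumulated into the corresponding row of the M0xN0 output block.
+ */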
+#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + }) +#elif M0 == 5 // M0 == 5 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + }) +#elif M0 == 6 // M0 == 6 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + }) +#elif M0 == 7 // M0 == 7 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + }) +#elif M0 == 8 // M0 == 8 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ + }) +#else // M0 not supported +#error "M0 not supported" +#endif // M0 not supported + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK + * (e.g. -DM=52, -DN=30 and -DK=90) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., + * -DK0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. 
-DN0=2) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: + * F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x lhs_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y lhs_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: + * same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes) + * @param[in] rhs_step_x rhs_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes) + * @param[in] rhs_step_y rhs_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix + * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + for (; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero); + + RHS_VFMA_M0xN0(0, a, b0, c); + RHS_VFMA_M0xN0(1, a, b1, c); +#if K0 > 2 + RHS_VFMA_M0xN0(2, a, b2, c); +#endif // K0 > 2 +#if K0 > 3 + RHS_VFMA_M0xN0(3, a, b3, c); +#endif // K0 > 3 +#if K0 > 4 + RHS_VFMA_M0xN0(4, a, b4, c); + RHS_VFMA_M0xN0(5, a, b5, c); + RHS_VFMA_M0xN0(6, a, b6, c); + RHS_VFMA_M0xN0(7, a, b7, c); +#endif // K0 > 4 +#if K0 > 8 + RHS_VFMA_M0xN0(8, a, b8, c); + RHS_VFMA_M0xN0(9, a, b9, c); + RHS_VFMA_M0xN0(A, a, bA, c); + RHS_VFMA_M0xN0(B, a, bB, c); + RHS_VFMA_M0xN0(C, a, bC, c); + RHS_VFMA_M0xN0(D, a, bD, c); + RHS_VFMA_M0xN0(E, a, bE, c); + RHS_VFMA_M0xN0(F, a, bF, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over accumulations + for (; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4)); +#endif // M0 > 4 +#if M0 > 5 + VEC_DATA_TYPE(DATA_TYPE, 2) + a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5)); +#endif // M0 > 5 +#if M0 > 6 + VEC_DATA_TYPE(DATA_TYPE, 2) + a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6)); +#endif // M0 > 6 +#if M0 > 7 + VEC_DATA_TYPE(DATA_TYPE, 2) + a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7)); +#endif // M0 > 7 + + VEC_DATA_TYPE(DATA_TYPE, N0) + b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y)); + RHS_VFMA_M0xN0(0, a, b, c); + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, 
DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)M0 * bias_stride_y) + + get_global_id(2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(M0, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) + +#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) +/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between + * matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint 
src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global float *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + 
zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x4 block + vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel is optimized for Bifrost and tt computes the matrix multiplication between + * matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + +#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH)) + + int i = 0; + for (; i <= (int)(COLS_MTX_B - 4); i += 4) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, 
src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + for (; i < (int)(COLS_MTX_B); ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, 
c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x4 block + vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +// Undefine local defines +#undef COLS_MTX_B + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and + * matrix B 
reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * 
MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + c0 += (half8)a0.s0 * b0; + c1 += (half8)a0.s1 * b0; + c2 += (half8)a0.s2 * b0; + c3 += (half8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and + * matrix B reshaped (src1) while accumulating the result in a 32 floating point variable. + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint 
src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float8 c0 = 0.0f; + float8 c1 = 0.0f; + float8 c2 = 0.0f; + float8 c3 = 0.0f; + + for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT)); + b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + for (; src_addr_b < src_end_addr_b; + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / 
(uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, float, c, ALPHA); +#endif // defined(ALPHA) + +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias_f, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias_f0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + float8 bias_f1 = convert_float8(bias1); + float8 bias_f2 = convert_float8(bias2); + float8 bias_f3 = convert_float8(bias3); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, float, bias_f, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias_f); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + half8 c_h0 = convert_half8(c0); + half8 c_h1 = convert_half8(c1); + half8 c_h2 = convert_half8(c2); + half8 c_h3 = convert_half8(c3); + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication + * between matrix A reshaped (src0) and matrix B reshaped (src1) + * + * @note The number of columns of matrix B and the optional alpha's value need to be passed at + * compile time using -DCOLS_B and -DALPHA + * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be + * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at + * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. 
a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + +#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH)) + + int i = 0; + for (; i <= (int)(COLS_MTX_B - 4); i += 4) + { +#if MULT_INTERLEAVE4X4_HEIGHT == 1 + // Load values from matrix A (interleaved) and matrix B 
(transposed) + half8 a0 = vload8(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload8(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); +#else // MULT_INTERLEAVE4X4_HEIGHT == 1 + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); +#endif // MULT_INTERLEAVE4X4_HEIGHT == 1 + } + + for (; i < (int)(COLS_MTX_B); ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT; + src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the 
presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0)); + vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1)); + vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2)); + vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3)); +} + +// Undefine local defines +#undef COLS_MTX_B + +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) + +#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && \ + (NUM_ELEMS_PROCESSED_PER_THREAD_Y) +#if defined(DATA_TYPE) +#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped. + * + * @note This OpenCL kernel works with floating point data types (F16/F32) + * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. 
+ * -DDATA_TYPE=float) + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y + * @note The number of matrix A columns and the optional alpha's value need to be passed at compile + * time using -DCOLS_A and -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16/F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(DATA_TYPE); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // 
Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE)); + + VECTOR_TYPE acc0 = 0.0f; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VECTOR_TYPE acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VECTOR_TYPE acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VECTOR_TYPE acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); + src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, + src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)( + 0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0.s0; + acc0 += b1 * (VECTOR_TYPE)a0.s1; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * (VECTOR_TYPE)a1.s0; + acc1 += b1 * (VECTOR_TYPE)a1.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * (VECTOR_TYPE)a2.s0; + acc2 += b1 * (VECTOR_TYPE)a2.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * (VECTOR_TYPE)a3.s0; + acc3 += b1 * (VECTOR_TYPE)a3.s1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = + VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * (VECTOR_TYPE)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * (VECTOR_TYPE)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * (VECTOR_TYPE)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, + zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, + src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, + dst_addr, dst_stride_y, zout.s); +} +#endif // defined(DATA_TYPE) + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma + * units. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 
+ * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to 
the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float4 acc0 = 0.0f; + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float4 acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float4 acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float4 acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s0, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s0, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s0, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s0, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s0, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s0, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s0, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s0, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s0, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s0, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s0, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s0, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s0, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s0, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s1, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s1, b0.s1, acc0.s1); + acc0.s2 = 
fma(a0.s1, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s1, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s1, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s1, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s1, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s1, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s1, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s1, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s1, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s1, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s1, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s1, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s1, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s1, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s2, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s2, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s2, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s2, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s2, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s2, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s2, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s2, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s2, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s2, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s2, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s2, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s2, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s2, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s2, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s2, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s3, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s3, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s3, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s3, b0.s3, acc0.s3); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + + acc1.s0 = fma(a1.s3, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s3, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s3, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s3, b0.s3, acc1.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + + acc2.s0 = fma(a2.s3, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s3, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s3, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s3, b0.s3, acc2.s3); + +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + acc3.s0 = fma(a3.s3, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s3, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s3, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s3, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(float); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + 
zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); + acc0.s2 = fma(a0, b0.s2, acc0.s2); + acc0.s3 = fma(a0, b0.s3, acc0.s3); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); + acc1.s2 = fma(a1, b0.s2, acc1.s2); + acc1.s3 = fma(a1, b0.s3, acc1.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); + acc2.s2 = fma(a2, b0.s2, acc2.s2); + acc2.s3 = fma(a2, b0.s3, acc2.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); + acc3.s2 = fma(a3, b0.s2, acc3.s2); + acc3.s3 = fma(a3, b0.s3, acc3.s3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma + * units. This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or + * equal to 1000. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if + * alpha!=1.0f. + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
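For orientation, here is a scalar C sketch of what one work-item of this Bifrost variant computes: two adjacent output columns of one row, with the inner (K) loop that the kernel unrolls by eight. The function and variable names are hypothetical and only illustrate the arithmetic.

#include <math.h>

/* One output row, two adjacent columns, as with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2. */
static void gemm_two_cols_ref(const float *a_row, const float *b, int cols_a,
                              int ldb /* elements per row of B */, int col, float out[2])
{
  float acc0 = 0.0f, acc1 = 0.0f;
  for (int k = 0; k < cols_a; ++k) /* the kernel unrolls this loop by 8 using fma() */
  {
    acc0 = fmaf(a_row[k], b[k * ldb + col + 0], acc0);
    acc1 = fmaf(a_row[k], b[k * ldb + col + 1], acc1);
  }
  out[0] = acc0;
  out[1] = acc1;
}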
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for + // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + 
(uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float2 acc0 = 0.0f; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float2 acc1 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float2 acc2 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float2 acc3 = 0.0f; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for (; i <= ((int)COLS_A - 8); i += 8) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0)); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s0 = fma(a0.s1, b1.s0, acc0.s0); + acc0.s0 = fma(a0.s2, b2.s0, acc0.s0); + acc0.s0 = fma(a0.s3, b3.s0, acc0.s0); + acc0.s0 = fma(a0.s4, b4.s0, acc0.s0); + acc0.s0 = fma(a0.s5, b5.s0, acc0.s0); + acc0.s0 = fma(a0.s6, b6.s0, acc0.s0); + acc0.s0 = fma(a0.s7, b7.s0, acc0.s0); + + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s1 = fma(a0.s1, b1.s1, acc0.s1); + acc0.s1 = fma(a0.s2, b2.s1, acc0.s1); + acc0.s1 = fma(a0.s3, b3.s1, acc0.s1); + acc0.s1 = fma(a0.s4, b4.s1, acc0.s1); + acc0.s1 = fma(a0.s5, b5.s1, acc0.s1); + acc0.s1 = fma(a0.s6, b6.s1, acc0.s1); + acc0.s1 = fma(a0.s7, b7.s1, acc0.s1); + +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * 
src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc1.s0 = fma(a0.s0, b0.s0, acc1.s0); + acc1.s0 = fma(a0.s1, b1.s0, acc1.s0); + acc1.s0 = fma(a0.s2, b2.s0, acc1.s0); + acc1.s0 = fma(a0.s3, b3.s0, acc1.s0); + acc1.s0 = fma(a0.s4, b4.s0, acc1.s0); + acc1.s0 = fma(a0.s5, b5.s0, acc1.s0); + acc1.s0 = fma(a0.s6, b6.s0, acc1.s0); + acc1.s0 = fma(a0.s7, b7.s0, acc1.s0); + + acc1.s1 = fma(a0.s0, b0.s1, acc1.s1); + acc1.s1 = fma(a0.s1, b1.s1, acc1.s1); + acc1.s1 = fma(a0.s2, b2.s1, acc1.s1); + acc1.s1 = fma(a0.s3, b3.s1, acc1.s1); + acc1.s1 = fma(a0.s4, b4.s1, acc1.s1); + acc1.s1 = fma(a0.s5, b5.s1, acc1.s1); + acc1.s1 = fma(a0.s6, b6.s1, acc1.s1); + acc1.s1 = fma(a0.s7, b7.s1, acc1.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc2.s0 = fma(a0.s0, b0.s0, acc2.s0); + acc2.s0 = fma(a0.s1, b1.s0, acc2.s0); + acc2.s0 = fma(a0.s2, b2.s0, acc2.s0); + acc2.s0 = fma(a0.s3, b3.s0, acc2.s0); + acc2.s0 = fma(a0.s4, b4.s0, acc2.s0); + acc2.s0 = fma(a0.s5, b5.s0, acc2.s0); + acc2.s0 = fma(a0.s6, b6.s0, acc2.s0); + acc2.s0 = fma(a0.s7, b7.s0, acc2.s0); + + acc2.s1 = fma(a0.s0, b0.s1, acc2.s1); + acc2.s1 = fma(a0.s1, b1.s1, acc2.s1); + acc2.s1 = fma(a0.s2, b2.s1, acc2.s1); + acc2.s1 = fma(a0.s3, b3.s1, acc2.s1); + acc2.s1 = fma(a0.s4, b4.s1, acc2.s1); + acc2.s1 = fma(a0.s5, b5.s1, acc2.s1); + acc2.s1 = fma(a0.s6, b6.s1, acc2.s1); + acc2.s1 = fma(a0.s7, b7.s1, acc2.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc3.s0 = fma(a0.s0, b0.s0, acc3.s0); + acc3.s0 = fma(a0.s1, b1.s0, acc3.s0); + acc3.s0 = fma(a0.s2, b2.s0, acc3.s0); + acc3.s0 = fma(a0.s3, b3.s0, acc3.s0); + acc3.s0 = fma(a0.s4, b4.s0, acc3.s0); + acc3.s0 = fma(a0.s5, b5.s0, acc3.s0); + acc3.s0 = fma(a0.s6, b6.s0, acc3.s0); + acc3.s0 = fma(a0.s7, b7.s0, acc3.s0); + + acc3.s1 = fma(a0.s0, b0.s1, acc3.s1); + acc3.s1 = fma(a0.s1, b1.s1, acc3.s1); + acc3.s1 = fma(a0.s2, b2.s1, acc3.s1); + acc3.s1 = fma(a0.s3, b3.s1, acc3.s1); + acc3.s1 = fma(a0.s4, b4.s1, acc3.s1); + acc3.s1 = fma(a0.s5, b5.s1, acc3.s1); + acc3.s1 = fma(a0.s6, b6.s1, acc3.s1); + acc3.s1 = fma(a0.s7, b7.s1, acc3.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float) * 8; + } + // float size increment + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + 
zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)); + + LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not beed reshaped + * + * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulating + * the result in a 32 floating point variable. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
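A scalar C sketch of the accumulate-in-f32 idea this kernel's notes describe: the half-precision operands are widened before each fused multiply-add, so rounding back to f16 happens only once, at the final store. The names below are illustrative only.

#include <math.h>

/* a and b hold f16 values already widened to f32 (what convert_float8() does
 * in the kernel); the running sum stays in f32 throughout. */
static float dot_acc32_ref(const float *a, const float *b, int cols_a)
{
  float acc = 0.0f;
  for (int k = 0; k < cols_a; ++k)
    acc = fmaf(a[k], b[k], acc); /* no intermediate f16 rounding */
  return acc;                    /* converted back to f16 only when the tile is stored */
}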
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(half); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add 
offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + float8 acc0 = 0.0h; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float8 acc1 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float8 acc2 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float8 acc3 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (float8)a0.s0, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s0, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s0, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s0, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s1, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s1, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s1, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s1, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s2, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s2, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s2, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s2, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (float8)a0.s3, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1.s3, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2.s3, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3.s3, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // 
| | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA); +#endif // defined(ALPHA) + +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias_f, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + + float8 bias_f0 = convert_float8(bias0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float8 bias_f1 = convert_float8(bias1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float8 bias_f2 = convert_float8(bias2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float8 bias_f3 = convert_float8(bias3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + + half8 acc_h0 = convert_half8(acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half8 acc_h1 = convert_half8(acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half8 acc_h2 = convert_half8(acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half8 acc_h3 = convert_half8(acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s); +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and + * matrix B (src1) in case both matrices have not beed reshaped + * + * @note This OpenCL kernel works with the 16-bit floating point data type (half) 
and uses the fma + * units. + * @note The number of elements processed along the x and y directions must be passed at compile + * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel + * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. + * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. + * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid + * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using + * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) This case can happen when GEMM is used to perform the + * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have + * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. + * -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed + * at compile time as well using -DA_VAL= and -DB_VAL= respectively. The activation function is + * performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported + * data type: same as @p lhs_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X + * dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y + * dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the + * bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z + * dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for + * the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X; + + // Compute starting address for matrix A and Matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(half); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset 
due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + half8 acc0 = 0.0h; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half8 acc1 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half8 acc2 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half8 acc3 = 0.0h; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + int i = 0; + for (; i <= ((int)COLS_A - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, + zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (half8)a0.s0, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s0, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s0, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s0, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s1, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s1, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s1, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s1, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s2, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s2, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s2, acc2); +#endif // 
NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s2, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s3, acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1.s3, acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2.s3, acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3.s3, acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for (; i < (int)COLS_A; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0; +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3; +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + int z = get_global_id(2); + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across + // the z dimension in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 
NUM_ELEMS_PROCESSED_PER_THREAD_Y) + // by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / + (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = + src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + + (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + + get_global_id(2) * src2_stride_z; + + LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA); +#endif // UNIT_BIAS + + // acc = acc + bias + ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s); +} +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && + // (NUM_ELEMS_PROCESSED_PER_THREAD_Y) + +#if defined(BETA) +/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account + * that the second matrix might be weighted by a scalar value beta: + * + * @note The beta's value need to be passed at compile time using -DBETA + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: + * F32 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] src_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Load values from A x B + float4 alpha_ab = vload4(0, (__global float *)dst.ptr); + + // Load values from Matrix C + float4 c = vload4(0, (__global float *)src.ptr); + + // Computes alpha * axb + beta * c + float4 out = alpha_ab + (float4)BETA * c; + + // Store final result in axb matrix + vstore4(out, 0, (__global float *)dst.ptr); +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account + * that the second matrix might be weighted by a scalar value beta: + * + * @note The beta's value need to be passed at compile time using -DBETA + * + * @param[in] src_ptr Pointer to the source matrix. 
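The gemm_ma_f32 kernel above reduces to a one-line update per element. A plain C sketch with hypothetical names; beta stands in for the -DBETA build option:

/* dst already holds alpha * (A x B); src holds matrix C. */
static void gemm_ma_ref(float *dst, const float *src, float beta, int n)
{
  for (int i = 0; i < n; ++i)
    dst[i] += beta * src[i]; /* in-place: alpha*AB + beta*C */
}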
Supported data types: + * F16 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] src_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Load values from A x B + half8 alpha_ab = vload8(0, (__global half *)dst.ptr); + + // Load values from Matrix C + half8 c = vload8(0, (__global half *)src.ptr); + + // Computes alpha * axb + beta * c + half8 out = alpha_ab + (half8)BETA * c; + + // Store final result in axb matrix + vstore8(out, 0, (__global half *)dst.ptr); +} +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#endif // defined(BETA) + +#if defined(WIDTH_VECTOR_A) +/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and + * matrix B (src1) used for locally connected layer + * + * @note The width of A need to be passed at compile time using -DWIDTH_VECTOR_A + * + * @note The input A and matrix B must not be reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data + * types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[in] src1_ptr Pointer to the source matrix. 
Supported data + * types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in + * bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in + * bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in + * bytes) + * @param[in] src1_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + */ +__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0), TENSOR3D_DECLARATION(src1), + IMAGE_DECLARATION(dst)) +{ + int idx = get_global_id(0) * 4; + int idy = get_global_id(1); + + // Compute the address for the vector A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, + src1_offset_first_element_in_bytes + src1_stride_z * idy)); + src_addr.s1 += idx * sizeof(float); + + int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float)); + + float4 acc = 0.0f; + + for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); + src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y)) + { + float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0)); + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + acc += b0 * (float4)a0.s0; + acc += b1 * (float4)a0.s1; + } + + for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y)) + { + float a0 = *((__global float *)(src0_ptr + src_addr.s0)); + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + + acc += b0 * (float4)a0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0))); +} +#endif // defined(WIDTH_VECTOR_A) + +/** This kernel accumulates each row with the biases vector. + * + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short. + * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16. + * + * @param[in, out] accum_ptr Pointer to the accumulate tensor. 
Supported + * data type: U8/S8/U16/S16/F16/U32/S32/F32 + * @param[in] accum_stride_x Stride of the accmulate tensor in X + * dimension (in bytes) + * @param[in] accum_step_x accum_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] accum_stride_y Stride of the accumlulate tensor in Y + * dimension (in bytes) + * @param[in] accum_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the + * accumulate tensor + * @param[in] biases_ptr Pointer to the biases vector. Same as @p + * accum_ptr + * @param[in] biases_stride_x Stride of the destination tensor in X + * dimension (in bytes) + * @param[in] biases_step_x dst_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +#if defined(DATA_TYPE) && defined(VECTOR_SIZE) +__kernel void gemm_accumulate_biases(IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases)) +{ + Image accum = CONVERT_TO_IMAGE_STRUCT(accum); + Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); + + // Vector size, e.g. number of vector elements. + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr); + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr); + accum_value = biases_value + accum_value; + // Store result in the accumulate buffer + VSTORE(VECTOR_SIZE) + (accum_value, 0, (__global DATA_TYPE *)accum.ptr); +} +#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h new file mode 100644 index 000000000..0c75d061f --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h @@ -0,0 +1,1235 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" + +/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). + * @name LOAD_ROW_n + * + * @param[in] N0 The number of rows to load + * @param[in] DATA_TYPE The data type of variables + * @param[in] BASENAME The basename of the destination variables for the loaded rows + * @param[in] PTR The base pointer + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The z-axis offset vector + * @{ + */ +#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); + +#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); + +#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); + +#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); + +#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); + +#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); + +#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); + +#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); + +#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); + +#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); + +#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + 
VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); + +#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); + +#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); + +#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); + +#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); + +#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); + +/** @}*/ // end of group LOAD_ROW_n + +/** Load Blocks (consecutive rows and columns) with Z offset. + * @name LOAD_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 + * The data to load is expected to have consecutive names for each row. + * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2. + * + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive columns + * @param[in] DATA_TYPE The data type of the target + * @param[in] BASENAME The basename of the result variables + * @param[in] PTR The base pointer for the data + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride in y-axis direction + * @param[in] Z The z-axis offset vector + * @{ + */ +#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ + LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) +/** @} */ // end of group LOAD_BLOCK + +/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). 
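+ *
+ * A minimal expansion sketch (assuming N0=4, DATA_TYPE=float, BASENAME=a and
+ * hypothetical arguments src_ptr, src_offset and src_stride_y):
+ *   LOAD_ELEMENT_2(4, float, a, src_ptr, src_offset, src_stride_y)
+ *   // roughly expands to
+ *   //   float4 a0 = *((__global float *)(src_ptr + src_offset + 0 * src_stride_y));
+ *   //   float4 a1 = *((__global float *)(src_ptr + src_offset + 1 * src_stride_y));
+ *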
+ * @name LOAD_ELEMENT_n + * + * @param[in] N0 The number of rows to load + * @param[in] DATA_TYPE The data type of variables + * @param[in] BASENAME The basename of the destination variables for the loaded rows + * @param[in] PTR The base pointer + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride value in y-axis direction + * @{ + */ +#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y)); + +#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y)); + +#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y)); + +#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); + +#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); + +#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); + +#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); + +#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); + +#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); + +#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); + +#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); + +#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); + +#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); + +#define 
LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); + +#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); + +#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + VEC_DATA_TYPE(DATA_TYPE, N0) \ + BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); + +/** @}*/ // end of group LOAD_ELEMENT_n + +/** Load Scalar as Vector (consecutive elements). + * @name LOAD_SCALAR_AS_VECTOR + * + * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16 + * The data to load is expected to have consecutive names for each row. + * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. + * + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive columns + * @param[in] DATA_TYPE The data type of the target + * @param[in] BASENAME The basename of the result variables + * @param[in] PTR The base pointer for the data + * @param[in] OFFSET The offset within a row + * @param[in] STRIDE_Y The stride in y-axis direction + * @{ + */ +#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ + LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) +/** @} */ // end of group LOAD_SCALAR_AS_VECTOR + +/** Basic macros to calculate Z offset values from Z0 to Zn-1 + * @name CALCULATE_Z_OFFSET_n + * + * @param[in] M0 The number of offset values to calculate + * @param[in] DATA_TYPE The data type of the results + * @param[in] Z The basename of the result variables + * @param[in] Y The work-itme ID of y-axis + * @param[in] HEIGHT_GEMM3D The height of GEMM3D + * @param[in] DEPTH_GEMM3D The depth of GEMM3D + * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension + * @param[in] STRIDE_Y The stride value in y-axis direction + * + * @{ + */ +#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ + Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ + Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ + Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, 
CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ + Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ + Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ + Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ + Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); + +#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \ + Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ + Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); + +/** @} */ // end of group CALCULATE_Z_OFFSET_n + +/** Calculate Z offset values from Z0 to Zn-1 + * @name CALCULATE_Z_OFFSET + * + * The Z offsets are expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected names of Z offsets are zin1, zin2, zin3. + * Note that, CROSS_PLANE_PAD (cross plain padding) is required to take into account + * the possible cross plane paddings in case of the plance changes across the z-dimension. 
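+ *
+ * A usage sketch (assuming M0=2, DATA_TYPE=uint, Z=zin, Y taken from the y work-item id,
+ * and hypothetical HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD and src_stride_y values
+ * provided by the caller, typically as compile-time defines):
+ *   CALCULATE_Z_OFFSET(2, uint, zin, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D,
+ *                      CROSS_PLANE_PAD, src_stride_y);
+ *   // zin0 and zin1 (declared beforehand by the caller) now hold the extra byte
+ *   // offsets to add when addressing rows 0 and 1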
+ * + * <!-- + * | | + * | plane0 | + * | | + * |__________________| + * |******************| + * | cross_plane_pad | + * |******************| + * | | + * | plane1 | + * | | + * |__________________| + * --> + * + * @param[in] M0 The number of offset values to calculate + * @param[in] DATA_TYPE The data type of the results + * @param[in] Z The basename of the result variables + * @param[in] Y The work-itme ID of y-axis + * @param[in] HEIGHT_GEMM3D The height of GEMM3D + * @param[in] DEPTH_GEMM3D The depth of GEMM3D + * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension + * @param[in] STRIDE_Y The stride value in y-axis direction + * @{ + */ +#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) +#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) \ + CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \ + STRIDE_Y) +/** @} */ // end of group CALCULATE_Z_OFFSET + +/** Store the 0 to (n-1)th rows of the given variables + * @name STORE_ROW_n + * + * @param[in] N0 The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); + +#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); + +#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); + +#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); + +#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); + +#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); + +#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); + +#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); + +#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); + +#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + 
VSTORE(N0) \ + (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); + +#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); + +#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); + +#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); + +#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); + +#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); + +#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); +/** @} */ // end of groupd STORE_ROW_n + +/** Convert and store the 0th to (n-1)th rows of the given variables + * @name CONVERT_STORE_ROW_n + * + * @param[in] N0 The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); + +#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); + +#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); + +#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); + +#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); + +#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); + +#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, 
PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); + +#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); + +#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); + +#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); + +#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); + +#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); + +#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); + +#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); + +#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); + +#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + VSTORE(N0) \ + (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \ + (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); + +/** @} */ // end of groupd CONVERT_STORE_ROW_n + +/** Store a block of the given size M0xN0 + * @name STORE_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. 
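+ *
+ * A minimal expansion sketch (assuming M0=2, N0=4, DATA_TYPE=float, BASENAME=c and
+ * hypothetical arguments dst_addr, dst_stride_y and Z offsets zout0/zout1):
+ *   STORE_BLOCK(2, 4, float, c, dst_addr, dst_stride_y, zout)
+ *   // roughly expands to
+ *   //   vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout0));
+ *   //   vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout1));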
+ * + * @param[in] M0 The number of rows to store + * @param[in] N0 The size of each vector + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +/** @} */ // end of group STORE_BLOCK + +/** Convert and store a block of the given size M0xN0 + * @name CONVERT_STORE_BLOCK + * + * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16. + * The data to store is expected to have consecutive names for each row. + * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2. + * The Z offset is expected to have consecutive names. + * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2. + * + * @param[in] M0 The number of rows to store + * @param[in] N0 The size of each vector + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] PTR The base pointer + * @param[in] STRIDE_Y The stride value in y-axis direction + * @param[in] Z The offset in z-axis direction + * @{ + */ +#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ + CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) +/** @} */ // end of group CONVERT_STORE_BLOCK + +/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1) + * @name SCALE_ROW_n + * + * @param[in] DATA_TYPE The data type of the variables + * @param[in] BASENAME The basename of the variables + * @param[in] SCALE The scale factor + * @{ + */ +#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##1 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##2 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##3 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##4 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##5 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##6 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##7 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##8 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##9 *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##A *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_11(DATA_TYPE, 
BASENAME, SCALE) \ + BASENAME##B *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##C *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##D *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##E *= (DATA_TYPE)SCALE; + +#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ + SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ + BASENAME##F *= (DATA_TYPE)SCALE; +/** @} */ // end of group SCALE_ROW_n + +/** Scale elements stored in a block (BASENAME) + * @name SCALE_BLOCK + * + * Supported cases are N=1,2,3,...,16 + * + * @param[in] N The number of rows in the block + * @param[in] DATA_TYPE The data type of the block + * @param[in] BASENAME The basename of the block + * @param[in] SCALE The scale factor + * @{ + */ +#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) +#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) +/** @} */ // end of group SCALE_BLOCK + +/** Create a new vector containing the values at the given index for a set of given vectors + * @name COLUMN_VECTORn + * + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] X The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + * @{ + */ +#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ + TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); +#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 2) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); +#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 3) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); +#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 4) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \ + (X##2).s##IDX_COL, (X##3).s##IDX_COL); +#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \ + (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \ + (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); +#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \ + (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \ + (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, \ + (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, \ + (X##F).s##IDX_COL); +/** @} */ // end of group COLUMN_VECTORn + +/** Create a new vector containing the values at the given index. 
Utility macros for transposing a + * colum-vector + * @name COLUMN_VECTOR_SCALARn + * + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] X The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + * @{ + */ +#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0)); +#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 2) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); +#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 3) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); +#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 4) \ + BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); +#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 8) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); +#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ + VEC_DATA_TYPE(TYPE, 16) \ + BASENAME##IDX_COL = \ + (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \ + (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); +/** @} */ // end of group COLUMN_VECTORn + +/** Create transposed vectors of the given vectors + * @name TRANSPOSE_K0Xn + * + * @param[in] K0 The size of the source vectors + * @param[in] BASENAME The basename of transposed vectors + * @param[in] B The basename of source vectors for transposition + * @param[in] TYPE The data type of the transposed vectors + * @{ + */ +#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE); +#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \ + COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE); +#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X2(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE); +#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X3(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE); +#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X4(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE); +#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \ + TRANSPOSE_K0X8(K0, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, A, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, B, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, C, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, D, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, E, BASENAME, B, TYPE); \ + COLUMN_VECTOR(K0, F, BASENAME, B, TYPE); + +/** @} */ // end of group TRANSPOSE_K0Xn + +/** Create column vectors to contain the values at the given index for a set of given vectors + * + * @param[in] K0 The number of source vectors + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] B The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + */ +#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \ + CONCAT(COLUMN_VECTOR, K0) \ + (IDX_COL, BASENAME, B, TYPE); + +/** Create column vectors to contain the values at 
the given index. Utility macro for transposing a + * column-vector + * + * @param[in] K0 The number of source vectors + * @param[in] IDX_COL The index value + * @param[in] BASENAME The basename of the destination vectors + * @param[in] B The basename of the source vectors + * @param[in] TYPE The data type of the destination vectors + */ +#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \ + CONCAT(COLUMN_VECTOR_SCALAR, K0) \ + (IDX_COL, BASENAME, B, TYPE); + +/** Create transposed vectors form the given source vectors + * + * @param[in] K0 The size of source vectors + * @param[in] N0 The number of source vectors + * @param[in] BASENAME The basename of transposed vectors + * @param[in] B The basename of source vectors for transposition + * @param[in] TYPE The data type of the transposed vectors + * + */ +#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \ + CONCAT(TRANSPOSE_K0X, N0) \ + (K0, BASENAME, B, TYPE); + +/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1) + * @name ADD_ROW_n + * + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The basename of the added variables + * @{ + */ +#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0; + +#define ADD_ROW_2(BASENAME, BIAS) \ + ADD_ROW_1(BASENAME, BIAS) \ + BASENAME##1 += BIAS##1; + +#define ADD_ROW_3(BASENAME, BIAS) \ + ADD_ROW_2(BASENAME, BIAS) \ + BASENAME##2 += BIAS##2; + +#define ADD_ROW_4(BASENAME, BIAS) \ + ADD_ROW_3(BASENAME, BIAS) \ + BASENAME##3 += BIAS##3; + +#define ADD_ROW_5(BASENAME, BIAS) \ + ADD_ROW_4(BASENAME, BIAS) \ + BASENAME##4 += BIAS##4; + +#define ADD_ROW_6(BASENAME, BIAS) \ + ADD_ROW_5(BASENAME, BIAS) \ + BASENAME##5 += BIAS##5; + +#define ADD_ROW_7(BASENAME, BIAS) \ + ADD_ROW_6(BASENAME, BIAS) \ + BASENAME##6 += BIAS##6; + +#define ADD_ROW_8(BASENAME, BIAS) \ + ADD_ROW_7(BASENAME, BIAS) \ + BASENAME##7 += BIAS##7; + +#define ADD_ROW_9(BASENAME, BIAS) \ + ADD_ROW_8(BASENAME, BIAS) \ + BASENAME##8 += BIAS##8; + +#define ADD_ROW_10(BASENAME, BIAS) \ + ADD_ROW_9(BASENAME, BIAS) \ + BASENAME##9 += BIAS##9; + +#define ADD_ROW_11(BASENAME, BIAS) \ + ADD_ROW_10(BASENAME, BIAS) \ + BASENAME##A += BIAS##A; + +#define ADD_ROW_12(BASENAME, BIAS) \ + ADD_ROW_11(BASENAME, BIAS) \ + BASENAME##B += BIAS##B; + +#define ADD_ROW_13(BASENAME, BIAS) \ + ADD_ROW_12(BASENAME, BIAS) \ + BASENAME##C += BIAS##C; + +#define ADD_ROW_14(BASENAME, BIAS) \ + ADD_ROW_13(BASENAME, BIAS) \ + BASENAME##D += BIAS##D; + +#define ADD_ROW_15(BASENAME, BIAS) \ + ADD_ROW_14(BASENAME, BIAS) \ + BASENAME##E += BIAS##E; + +#define ADD_ROW_16(BASENAME, BIAS) \ + ADD_ROW_15(BASENAME, BIAS) \ + BASENAME##F += BIAS##F; + +/** @} */ // end of group ADD_ROW_n + +/** Add the block (BIAS) to another block (BASENAME) + * @name ADD_BLOCK + * + * Supported cases are N=1,2,3,...,16 + * + * @param[in] N The number of vectors in the block + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The basename of the added variables + * @{ + */ +#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) +#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) +/** @} */ // end of group ADD_BLOCK + +/** Broadcast (add single value) to the each element of the destination variables + * @name ADD_ROW_BROADCAST_n + * + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The variable containing the value to add + * @{ + */ +#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS; + +#define 
ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ + BASENAME##1 += BIAS; + +#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ + BASENAME##2 += BIAS; + +#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ + BASENAME##3 += BIAS; + +#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ + BASENAME##4 += BIAS; + +#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ + BASENAME##5 += BIAS; + +#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ + BASENAME##6 += BIAS; + +#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ + BASENAME##7 += BIAS; + +#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ + BASENAME##8 += BIAS; + +#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ + BASENAME##9 += BIAS; + +#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ + BASENAME##A += BIAS; + +#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ + BASENAME##B += BIAS; + +#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ + BASENAME##C += BIAS; + +#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ + BASENAME##D += BIAS; + +#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ + BASENAME##E += BIAS; + +#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ + ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ + BASENAME##F += BIAS; + +/** Broadcast (add a value) to the each element of the destination block (BASENAME) + * @name ADD_BLOCK_BROADCAST + * + * Supported cases are N=1,2,3,...,16. 
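+ *
+ * A usage sketch (assuming N=2, accumulator rows c0 and c1 of type float4, and a bias
+ * value bias0 already loaded by the caller):
+ *   ADD_BLOCK_BROADCAST(2, c, bias0); // expands to: c0 += bias0; c1 += bias0;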
+ * + * @param[in] N The number of vectors in the block + * @param[in] BASENAME The basename of the destination variables + * @param[in] BIAS The variable containing the value to add + * @{ + */ +#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) +#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) +/** @} */ // end of group ADD_BLOCK_BROADCAST + +/** Apply activation to the given variables + * @name ACTIVATION_ROW_n + * + * @param[in] ACTIVATION_TYPE The type of the activation + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] A_VAL Additional value required by the activation + * @param[in] B_VAL Additional value required by the activation + * @{ + */ +#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL); + +#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL); + +#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL); + +#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL); + +#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL); + +#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL); + +#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL); + +#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL); + +#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL); + +#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL); + +#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL); + +#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##B = 
ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL); + +#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL); + +#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL); + +#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL); + +#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL); +/** @} */ // end of group ACTIVATION_ROW_n + +/** Apply activation to a block (BASENAME) + * @name ACTIVATION_BLOCK + * + * Supported cases are N=1,2,3,...,16. + * + * @param[in] N The number of vectors in the block + * @param[in] ACTIVATION_TYPE The type of the activation + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME The basename of the variables + * @param[in] A_VAL Additional value required by the activation + * @param[in] B_VAL Additional value required by the activation + * @{ + */ +#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) +#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \ + ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) +/** @} */ // end of group ACTIVATION_BLOCK + +/** Apply convert_<data_type> to the given variables + * @name CONVERT_ROW_n + * + * @param[in] N The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME_SRC The basename of the source variables + * @param[in] BASENAME_DST The basename of the destination variables + */ +#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##5 = 
CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); + +#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + VEC_DATA_TYPE(DATA_TYPE, N) \ + BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); +/** @} */ // end of group CONVERT_ROW_n + +/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST) + * @name CONVERT_BLOCK + * + * Supported cases N=1,2,3,...,16. 
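+ *
+ * A minimal expansion sketch (assuming M=2, N=4, source rows acc0 and acc1, and
+ * hypothetical destination rows out0 and out1 of type float4):
+ *   CONVERT_BLOCK(2, 4, float, acc, out)
+ *   // roughly expands to
+ *   //   float4 out0 = convert_float4(acc0);
+ *   //   float4 out1 = convert_float4(acc1);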
+ * + * @param[in] M The number of vectors to convert + * @param[in] N The size of the vectors + * @param[in] DATA_TYPE The data type of the vectors + * @param[in] BASENAME_SRC The basename of the source variables + * @param[in] BASENAME_DST The basename of the destination variables + */ +#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ + CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) +/** @} */ // end of group CONVERT_BLOCK diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl new file mode 100644 index 000000000..2d9acc753 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl @@ -0,0 +1,2733 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "gemm_helpers.h" +#include "helpers_asymm.h" +#include "repeat.h" + +#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \ + defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && + // defined(cl_arm_integer_dot_product_accumulate_int8) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform the dot product instruction between two vectors of size N [1,16]. + * These macros use the dot8 instruction */ +#define ARM_DOT1(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ + }) +#define ARM_DOT2(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), \ + (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ + }) +#define ARM_DOT4(a, b, c) ({ ARM_DOT(a, b, c); }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) + +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] + * without using the dot8 instruction. 
*/ +#define ARM_DOT1(a, b, c) ({ c += (ACC_DATA_TYPE)a * b; }) +#define ARM_DOT2(a, b, c) \ + ({ \ + c += (ACC_DATA_TYPE)a.s0 * b.s0; \ + c += (ACC_DATA_TYPE)a.s1 * b.s1; \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT2(a, b, c); \ + c += (ACC_DATA_TYPE)a.s2 * b.s2; \ + }) +#define ARM_DOT4(a, b, c) \ + ({ \ + ARM_DOT3(a, b, c); \ + c += (ACC_DATA_TYPE)a.s3 * b.s3; \ + }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 + * vectors "b" of size K0 [1,16] */ +#define ARM_DOT_K0X1(k0, a, b, c) ({ ARM_DOT_K0(k0, (a), (b##0), (c)); }) +#define ARM_DOT_K0X2(k0, a, b, c) \ + ({ \ + ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \ + ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \ + }) +#define ARM_DOT_K0X3(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X2(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \ + }) +#define ARM_DOT_K0X4(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X3(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \ + }) +#define ARM_DOT_K0X8(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X4(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \ + ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \ + ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \ + ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \ + }) +#define ARM_DOT_K0X16(k0, a, b, c) \ + ({ \ + ARM_DOT_K0X8(k0, a, b, c); \ + ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \ + ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \ + ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \ + ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \ + ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \ + ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \ + ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \ + ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \ + }) + +/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */ +#define ARM_MM_K0XN0X1(n0, k0, a, b, c) ({ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); }) +#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X1(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \ + }) +#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X2(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \ + }) +#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X3(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \ + }) +#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X4(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \ + }) +#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X5(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \ + }) +#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X6(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \ + }) +#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \ + ({ \ + ARM_MM_K0XN0X7(n0, k0, a, b, c); \ + ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \ + }) + +#define ARM_DOT_K0(k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT, k0) \ + ((a), (b), (c)); \ + }) + +#define ARM_DOT_K0XN0(n0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_DOT_K0X, n0) \ + (k0, (a), b, (c)); \ + }) + +#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MM_K0XN0X, m0) \ + (n0, k0, a, b, c); \ + }) + +/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 + * vectors "b" of size K0 [1,16] */ +#define 
ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) ({ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; }) +#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \ + c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \ + }) +#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \ + }) +#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \ + }) +#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \ + c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \ + c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \ + c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \ + }) +#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \ + ({ \ + ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \ + c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \ + c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \ + c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \ + c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \ + c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \ + c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \ + c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \ + c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \ + }) +/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */ +#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); }) +#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \ + }) +#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \ + }) +#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \ + }) +#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \ + }) +#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \ + }) +#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \ + }) +#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \ + ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \ + }) +#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MUL_N0X, k0) \ + (VECTOR_ACC_TYPE, (a), b, (c)); \ + }) +#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \ + ({ \ + CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \ + (VECTOR_ACC_TYPE, k0, a, b, c); \ + }) + +#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && \ + defined(N) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices with + * QASYMM/QASYMM_SIGNED data type. 
The LHS matrix must be reshaped with @ref + * CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed. The RHS matrix must be reshaped + * with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" + * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 + * and -DN=90). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) + * must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS + * matrix must be passed at compile time using -DV0 (i.e. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option + * -DLHS_INTERLEAVE must be passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must be passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution + * layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix.
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS + * matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements + * (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + + (z * lhs_stride_z); + + // Compute RHS matrix address + __global DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint 
zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + for (int i = 0; i < k; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + // Update address + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP); + rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is + * transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must passed at compile time. 
+ * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in + * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + 
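+ // Editorial illustration (comment only, not part of the upstream kernel): the preprocessor block below selects the RHS addressing constants. + // Assuming the example build options quoted in the kernel comment above (-DN0=8, -DK0=4, -DH0=2), RHS_BLOCK_SIZE evaluates to 32 elements. + // With -DRHS_INTERLEAVE defined, the H0 K0xN0 blocks stored on the same output row of the reshaped RHS matrix are interleaved, so RHS_OFFSET_X = K0 = 4 and RHS_STEP_X = K0 * H0 = 8; + // without it the blocks are stored back to back, so RHS_OFFSET_X = RHS_BLOCK_SIZE = 32 and RHS_STEP_X = K0 = 4.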
// RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + for (int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + lhs_offset += K0; + rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage + * using fixed-point arithmetic. The LHS matrix is NOT reshaped. The RHS matrix is reshaped with @ref + * CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at + * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4). + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS + * matrix must be passed at compile time using -DH0 (i.e. -DH0=2) + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option + * -DRHS_INTERLEAVE must be passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - H0 >= 1 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be + * passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed + * at compile time. + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix.
+ * Supported data type: QASYMM8/QASYMM8_SIGNED + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in + * X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in + * Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in + * the LHS reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. + * Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in + * X dimension (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in + * Y dimension (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in + * the RHS reshaped matrix + * @param[out] dst_ptr Pointer to the destination matrix + * Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in + * X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in + * Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in + * Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in + * Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS + * matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the + * output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. Supported data type: S32 + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. 
Supported data type: S32 + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. Supported data type: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint( + IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), IMAGE_DECLARATION(dst), uint lhs_stride_z, + uint rhs_stride_z, uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // 
Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0; + + for (int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs); + + // Partial matrix multiplication M0,N0,K0 + ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); + + lhs_offset += K0; + rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + // Result of MM is of type DATA_TYPE + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(DATA_TYPE) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert result of matrix multiplication to S32 + REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int); + + int batch_id = z; +#if defined(DEPTH_GEMM3D) + batch_id /= (int)DEPTH_GEMM3D; +#endif // defined(DEPTH_GEMM3D) + + // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET; + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET); + +#if defined(A_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_col_addr = + sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + +#if defined(SUM_COL_HAS_BATCHES) + sum_col_addr += z * sum_col_stride_y; +#endif // defined(SUM_COL_HAS_BATCHES) + VEC_DATA_TYPE(int, N0) + a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr); + a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET; + + REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32); +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to B_OFFSET + __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + + (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y; + +#if defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D) + sum_row_addr += (batch_id % (int)DEPTH_GEMM3D) * (int)HEIGHT_GEMM3D * sizeof(int); +#endif // defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D) + LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x); + + REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET); +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = + biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + + VEC_DATA_TYPE(int, N0) + bias_values = VLOAD(N0)(0, (__global int *)bias_addr); + REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values); +#endif // defined(ADD_BIAS) + + REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_); + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = result_multipliers_ptr + + result_multipliers_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int); + + VEC_DATA_TYPE(int, N0) + res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr); + VEC_DATA_TYPE(int, N0) + res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr); + + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, + RESULT_SHIFT); +#else // RESULT_SHIFT >= 0 + REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER, + RESULT_SHIFT); +#endif // RESULT_SHIFT < 0 + +#endif // defined(PER_CHANNEL_QUANTIZATION) + + // Add the offset terms to GEMM's result + REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET); + +#if defined(MIN_BOUND) + REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND); +#endif // defined(MIN_BOUND) 
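+ // Editorial illustration (comment only, not part of the upstream kernel): MIN_BOUND and MAX_BOUND are the optional clamping bounds described in the kernel comment above. + // For example, assuming an unsigned 8-bit output, building with -DMIN_BOUND=0 -DMAX_BOUND=255 clamps the requantized int32 values to the uchar range before the saturating convert-and-store below, which is how a fused "rectified linear unit" style activation is obtained.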
+#if defined(MAX_BOUND) + REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Convert and store output block (does convert saturate) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c_int, dst_addr, dst_stride_y, zout); + +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && + // defined(K) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. + * -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., + * -DK0=2) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following + * information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D + * tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data + * type: QASYMM8 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension + * (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped + * matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data + * type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension + * (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension + * (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped + * matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension + * (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in + * bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit + * of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in + * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z, + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +) +{ + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if ((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, + lhs_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, + 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + + for (; i <= (K - K0); i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,K0 +#if (GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += K0; + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over for loop + for (; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,1 +#if (GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += 1; + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, + dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Convert and store output block + CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout); +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) + +#if defined(COLS_A) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix + * A. It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at + * compile time. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. 
+ * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0; + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = + (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 16 accumulations + for (; i <= ((int)COLS_A - 16); i += 16) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i); + + sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + + CONVERT(a0.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)); + } + + // This for loop performs the leftover accumulations + for (; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + + sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3; + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A + * using the arm dot product instruction. It is also possible to multiply each reduced row by a + * scalar value, if SCALAR is passed at compile time. 
+ * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = + (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 32 accumulations + for (; i <= ((int)COLS_A - 32); i += 32) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + a0 = vload16(0, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + + a0 = vload16(1, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + } + + // This for loop performs the leftover accumulations + for (; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#endif // defined(COLS_A) + +#if defined(COLS_B) 
&& defined(ROWS_B) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of + * Matrix B. It is also possible to multiply each reduced column by a scalar value, if SCALAR is + * passed at compile time. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix B columns and rows needs to be passed at compile time using + * -DCOLS_B and -DROWS_B + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. + * -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE + * (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. + * -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: + * QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in + * bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * tensor + */ +__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 16) + sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0; + + __global const DATA_TYPE *matrix_b = + (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z); + + int i = 0; + // This for loop performs 4 accumulations + for (; i <= ((int)ROWS_B - 4); i += 4) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b + 0 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b1 = vload16(0, matrix_b + 1 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b2 = vload16(0, matrix_b + 2 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, 16) b3 = vload16(0, matrix_b + 3 * src_stride_y); + + sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) + + CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + + matrix_b += 4 * src_stride_y; + } + + // This for loop perfoms the leftover accumulations + for (; i < (int)ROWS_B; ++i) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b); + + sum_col_32 += 
CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)); + + matrix_b += src_stride_y; + } + +#if defined(SCALAR) + sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR; +#endif // defined(SCALAR) + VSTORE(16) + (convert_int16(sum_col_32), 0, (__global int *)dst.ptr); +} +#endif // defined(COLS_B) && defined(ROWS_B) + +#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(K_OFFSET) + +/* Helper function used to calculate the offset contribution after matrix multiplication. + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), + * and calculates the offset contribution of matrix A and matrix B. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * @param[in] x get_global_id(0) * 4 + * @param[in] y get_global_id(1) + * @param[in] z get_global_id(2) + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + */ +inline int4 offset_contribution(int x, int y, int z +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +) +{ + int4 a_offset_s32 = (int4)0; + int4 b_offset_s32 = (int4)0; + + int batch_id = z; +#if defined(DEPTH_INPUT3D) + batch_id /= (int)DEPTH_INPUT3D; +#endif // defined(DEPTH_INPUT3D) + +#if defined(A_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_col_addr = + sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int); + + // Compute the offset contribution due to A_OFFSET +#if defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); +#else // defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = vload4(0, (__global int *)sum_col_addr); +#endif // defined(SUM_COL_HAS_BATCHES) + + a_offset_s32 *= (int4)A_OFFSET; +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_row_addr = + sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int); + + // Compute the offset contribution due to B_OFFSET +#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); +#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); +#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 *= (int4)B_OFFSET; +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + b_offset_s32 += (int4)biases_values; +#endif // defined(ADD_BIAS) + + return (int4)K_OFFSET + a_offset_s32 + b_offset_s32; +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is + * performed in-place + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. 
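The two reduction kernels above and the offset_contribution() helper together implement the usual gemmlowp offset decomposition: the raw int32 product of the quantized matrices is corrected with the per-row sums of A (sum_row), the per-column sums of B (sum_col) and the constant K_OFFSET = A_OFFSET * B_OFFSET * K. As a rough scalar sketch (hypothetical names; the sign convention carried by A_OFFSET/B_OFFSET is assumed to match what the host code passes):

#include <stdint.h>

/* Illustrative reference only: apply the correction
 *   mm_result[i][j] += sum_col[j] * A_OFFSET + sum_row[i] * B_OFFSET + K_OFFSET
 * to an M x N block of raw accumulators, optionally adding a per-column bias. */
static void offset_contribution_ref(int32_t *mm_result, const int32_t *sum_row,
                                    const int32_t *sum_col, const int32_t *bias,
                                    int M, int N, int32_t a_offset, int32_t b_offset,
                                    int32_t k_offset)
{
  for (int i = 0; i < M; ++i)
  {
    for (int j = 0; j < N; ++j)
    {
      int32_t acc = mm_result[i * N + j];
      acc += sum_col[j] * a_offset; /* contribution of A's quantization offset */
      acc += sum_row[i] * b_offset; /* contribution of B's quantization offset */
      acc += k_offset;              /* A_OFFSET * B_OFFSET * K */
      if (bias)
        acc += bias[j];
      mm_result[i * N + j] = acc;
    }
  }
}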
Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * @param[in] mm_result_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of elements along + * X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of elements along + * Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of elements along + * Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. + * Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X + * dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in + * the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + */ +__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS)) +) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // Store the result with the offset contribution + vstore4(in_s32, 0, (__global int *)mm_result_addr); +} + +#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && \ + defined(OUTPUT_DATA_TYPE) +/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and + * it quantizes down to uint8. + * + * This kernel takes a final int32 accumulator value (the output of + * @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and + * quantizes to uint8 through the output stage. + * + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage. 
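For reference, the plain (non fixed-point) output stage listed just below, offset, integer multiply, shift, clamp and saturating cast, can be sketched in scalar C. This is an illustrative sketch only, written for QASYMM8 output and a non-negative RESULT_SHIFT; the kernel itself also handles a negative shift and the signed output type:

#include <stdint.h>

/* Illustrative scalar version of the "quantize down" output stage applied to
 * one accumulator that already contains the offset/bias contribution. */
static uint8_t quantize_down_ref(int32_t acc, int32_t result_offset,
                                 int32_t result_mult_int, int32_t result_shift,
                                 int32_t min_bound, int32_t max_bound)
{
  acc += result_offset;
  acc *= result_mult_int;
  acc >>= result_shift;             /* arithmetic shift right */
  if (acc < min_bound) acc = min_bound;  /* optional -DMIN_BOUND / -DMAX_BOUND clamp */
  if (acc > max_bound) acc = max_bound;
  if (acc < 0)   acc = 0;           /* saturating cast to the QASYMM8 range */
  if (acc > 255) acc = 255;
  return (uint8_t)acc;
}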
The output stage computes the + * following operations: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND + * are passed at compile time) + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] mm_result_ptr Pointer to the source tensor. + * Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of + * elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in + * the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. 
Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor + * Supported data type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in + * X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + , +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // -------------- OUTPUT STAGE + + // Add the offset terms to GEMM's result + in_s32 += (int4)RESULT_OFFSET; + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = + result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); + int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); + int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + + in_s32 *= result_multipliers_values; + in_s32 >>= result_shifts_values; +#else // defined(PER_CHANNEL_QUANTIZATION) + in_s32 *= RESULT_MULTIPLIER; + + in_s32 >>= RESULT_SHIFT; +#endif // defined(PER_CHANNEL_QUANTIZATION) + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes + * down to uint8. 
+ * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to + * it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output + * stage. + * + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) + * needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at + * compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at + * compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually + * if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage. The output stage computes the + * following operations: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] mm_result_ptr Pointer to the source tensor. + * Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X + * dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of + * elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y + * dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of + * elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of + * elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in + * the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source + * tensor. 
Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source + * tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source + * tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source + * tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number + * of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first + * element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases + * tensor. Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases + * tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number + * of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first + * element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor + * Supported data type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in + * X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in + * Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements + * along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z + * dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements + * along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in + * the destination tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output + * multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output + * multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) + * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output + * shifts vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output + * shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * + * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first + * element in the output shifts vector + */ +__kernel void + gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + , +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) + ) +{ + const int x = get_global_id(0) * 4; + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Compute offset contribution + int4 offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + + x * sizeof(int) + y * mm_result_stride_y + + z * mm_result_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 in_s32 = vload4(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // -------------- OUTPUT STAGE + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = + result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); + __global uchar *result_shifts_addr = + result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); + int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr); + int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr); + + int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + in_s32, result_multipliers_values, result_shifts_values, 4); + int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + in_s32, result_multipliers_values, result_shifts_values, 4); + in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); +#else // defined(PER_CHANNEL_QUANTIZATION) + +#if RESULT_SHIFT < 0 + in_s32 = + ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + +#endif // defined(PER_CHANNEL_QUANTIZATION) + + // Add the offset terms to GEMM's result + in_s32 += (int4)RESULT_OFFSET; + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 
4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && + // defined(OUTPUT_DATA_TYPE) + +#endif // defined(K_OFFSET) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value and processes it to obtain the final + * QASYMM8/QASYMM8_SIGNED value. The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND + * are passed at compile time) + * -# Clamp the resulting int32 values: + * -# - to the [0..255] range and cast to QASYMM8. + * -# - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Add the offset terms to GEMM's result + input_values += (int4)RESULT_OFFSET; + + // Multiply by result_mult_int and shift + input_values *= RESULT_MULT_INT; + +#if RESULT_SHIFT < 0 + input_values >>= -RESULT_SHIFT; +#else // RESULT_SHIFT >= 0 + input_values >>= RESULT_SHIFT; +#endif // RESULT_SHIFT < 0 + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) + +#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && \ + defined(RESULT_SHIFT) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. 
The following computations will be + * performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER + * and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + + // Add the offset terms to GEMM's result + input_values += (int4)RESULT_OFFSET_AFTER_SHIFT; + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && + // defined(RESULT_SHIFT) + +#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) + +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QSYMM16 value. 
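The ASYMM_MULT_BY_QUANT_MULTIPLIER_* macros used by the fixed-point kernels above (and by the QSYMM16 variant described next) follow the gemmlowp reference arithmetic: a saturating rounding doubling high multiply followed by a rounding division by a power of two. A scalar C sketch of those two building blocks (illustrative, not the library's own code):

#include <stdint.h>

/* (a * b * 2) >> 32, rounded to nearest, with the single overflowing
 * input pair a == b == INT32_MIN saturated to INT32_MAX. */
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
  if (a == INT32_MIN && b == INT32_MIN)
    return INT32_MAX;
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Round-to-nearest division by 2^exponent, ties away from zero. */
static int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
  const int32_t mask = (int32_t)((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* Roughly what the *_LESS_THAN_ONE path does for a non-negative shift. */
static int32_t multiply_by_quantized_multiplier(int32_t acc, int32_t multiplier, int shift)
{
  return rounding_divide_by_pot(rounding_doubling_high_mul(acc, multiplier), shift);
}

The *_GREATER_THAN_ONE variant, selected when the shift is negative, left-shifts the accumulator before the high multiply instead of dividing afterwards.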
The following computations will be performed by + * the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor + * must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
+ * Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X + * dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements + * along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in + * the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QSYMM16 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst)) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Multiply by result_mult_int and shift +#if RESULT_SHIFT < 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#else // RESULT_SHIFT >= 0 + input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE( + input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4); +#endif // RESULT_SHIFT < 0 + + short4 res = convert_short4_sat(input_values); + +#if defined(MIN_BOUND) + res = max(res, (short4)MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (short4)MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global short *)dst_addr); +} +#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) + +#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) +/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to + * QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and + * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. 
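The float-based output stage described next is simpler than the fixed-point one: the int32 accumulator (plus optional bias) is scaled by a single real multiplier, offset by the output zero point, rounded and saturated. A minimal scalar sketch, assuming QASYMM8 output:

#include <math.h>
#include <stdint.h>

/* Illustrative scalar version of the REAL_MULTIPLIER / OUTPUT_OFFSET stage. */
static uint8_t quantize_down_float_ref(int32_t acc, int32_t bias,
                                       float real_multiplier, float output_offset)
{
  float v = roundf((float)(acc + bias) * real_multiplier + output_offset);
  if (v < 0.f)   v = 0.f;   /* use [-128, 127] instead for QASYMM8_SIGNED */
  if (v > 255.f) v = 255.f;
  return (uint8_t)v;
}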
The following computations will be + * performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by + * result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Requantize + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset and scalar scale factor must be passed at compile time using + * -DRESULT_OFFSET, -DREAL_MULTIPLIER + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile + * time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at + * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified + * linear unit" activation functions + * + * @param[in] src_ptr Pointer to the source tensor. Supported data + * type: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source + * tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Supported data + * type: same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in + * bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases + * tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data + * type: QASYMM8 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in + * bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in + * bytes) + * @param[in] dst_step_w src_stride_w * number of elements along W + * processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + */ +__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src), +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) +#if defined(DST_HEIGHT) + TENSOR4D_DECLARATION(dst)) +#else // defined(DST_HEIGHT) + TENSOR3D_DECLARATION(dst)) +#endif // defined(DST_HEIGHT) +{ + // Compute source and destination addresses + int x = get_global_id(0) * 4; + int y = get_global_id(1); + int z = get_global_id(2); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + + y * src_stride_y + z * src_stride_z; + + __global uchar *dst_addr = + dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + int4 input_values = vload4(0, (__global int *)src_addr); + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + int4 biases_values = vload4(0, (__global int *)bias_addr); + input_values += (int4)biases_values; +#endif // defined(ADD_BIAS) + + // Convert to float + float4 input_values_f = convert_float4(input_values); + input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET); + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4) + res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)); + +#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr); +} +#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl new file mode 100644 index 000000000..51919c8a5 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2018-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants + +/** Fill the tensor's planes with all value + * @attention The following variables must be passed at compile time: + * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * -# -DCONSTANT_VALUE = The value use to fill the tensor's planes + * -# -DVEC_SIZE = Vector size + * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might + * need to step back a bit) + * + * @param[in] tensor_ptr Pointer to the source image. Data types + * supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32. 
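The LAST_ACCESSED_X mechanism mentioned above is the usual trick for vectorised writes when the row width is not a multiple of VEC_SIZE: the last work item is shifted back so its vector store overlaps the previous one instead of running out of bounds. A host-side C analogue (hypothetical helper, assuming the host sets LAST_ACCESSED_X = width - VEC_SIZE):

/* Emulates one row of the memset kernel: every "work item" writes a full
 * vector of vec_size elements; the final one is shifted back into bounds. */
static void fill_row(float *row, int width, int vec_size, float value)
{
  int last_accessed_x = width - vec_size;        /* assumed host-side setup */
  for (int gid = 0; gid * vec_size < width; ++gid)
  {
    int xi = gid * vec_size;
    int shift = xi - last_accessed_x;
    if (shift < 0) shift = 0;                    /* max(xi - LAST_ACCESSED_X, 0) */
    int start = xi - shift;                      /* shifted-back vector start */
    for (int j = 0; j < vec_size; ++j)
      row[start + j] = value;                    /* stands in for VSTORE(VEC_SIZE) */
  }
}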
+ * @param[in] tensor_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] tensor_step_x tensor_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] tensor_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] tensor_step_y tensor_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source + * image + * @param[in] value The value used to fill the pages of the tensor + */ +__kernel void memset(TENSOR3D_DECLARATION(tensor)) +{ + Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor); + +#if defined(VEC_SIZE) + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x; +#endif // defined(LAST_ACCESSED_X) + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = (DATA_TYPE)(CONSTANT_VALUE); + + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)tensor.ptr); +#else // !defined(VEC_SIZE) + *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE); +#endif // defined(VEC_SIZE) +} + +#endif // Check for compile time constants diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl new file mode 100644 index 000000000..96f2f9ef0 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && \ + defined(SRC_WIDTH) + +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE) +#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE) + +#if defined(CONST_VAL) +/** Perform a pad operation when PaddingMode is CONSTANT + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, + * e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. + * -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. + * -DSRC_WIDTH=224 + * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile + * flag, e.g. -DSELECT_DT=float + * @note In case pad left is more than the vector size, the number of threads to skip along the X + * axis must be passed using the -DNUM_THREADS_TO_SKIP_X compile flag, e.g. + * -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE) + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be + * passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. + * -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If pad also needs to be added to the batch of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. + * -DPAD_W_BEFORE=3) + * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: + * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * image + * @param[in] batch (Optional) Batch index if 4D pad must be applied + */ +__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst) +#if defined(PAD_W_BEFORE) + , + uint batch +#endif // defined(PAD_W_BEFORE) +) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + uint cond = 0; + +#if defined(PAD_W_BEFORE) + cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE); +#endif // defined(PAD_W_BEFORE) +#if defined(PAD_Z_BEFORE) + cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE); +#endif // defined(PAD_Z_BEFORE) + + if (cond) + { + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + VSTORE(VEC_SIZE) + ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr); + } + else + { + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#if defined(NUM_THREADS_TO_SKIP_X) + /* In case the pad left is greater than the vector size, and we are past the threads operating + * solely on pad values, the input pointer must be brought back along the X axis to start from + * the first non-pad values. + * + * E.g. 
with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|: + * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1)) + * < PAD_X_BEFORE + * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2, + * 3)) < PAD_X_BEFORE + * -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x * + * VEC_SIZE) == 4) values, reading |4 5| + * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and + * check that it is >= to the current x + * -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying + * this constant by the input's step along the X axis + * -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it + * will read the desired values |0 1| + */ + src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X); +#endif // defined(NUM_THREADS_TO_SKIP_X) +#if defined(PAD_Z_BEFORE) + src.ptr -= PAD_Z_BEFORE * src_step_z; +#endif // defined(PAD_Z_BEFORE) +#if defined(PAD_W_BEFORE) + src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z; +#endif // defined(PAD_W_BEFORE) + + VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + + VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT); + VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE); +#if defined(PAD_Y_BEFORE) + cond |= + (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE); +#endif // defined(PAD_Y_BEFORE) + VSTORE(VEC_SIZE) + (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0, + (__global DATA_TYPE *)dst.ptr); + } +} +#endif // defined(CONST_VAL) + +#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && \ + defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && \ + defined(AFTER_PAD_FACT_X) + +#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1 +#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n) +#define SYMM_REFL_LEFT(x, n0, n1) \ + select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0) +#define SYMM_REFL_RIGHT(x, n0, n1) \ + select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0) + +/** Perform a pad operation when PaddingMode is SYMMETRIC + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. + * -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. + * -DSRC_WIDTH=224 + * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile + * flag, e.g. -DSELECT_DT=float + * @note Number of values to the left when operating across left padding must be passed using the + * -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5 + * @note Number of values to the left when operating across right padding must be passed using the + * -DPAD_X_AFTER_REMAINDER compile flag, e.g. 
-DPAD_X_AFTER_REMAINDER=6 + * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is + * REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=6 + * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the + * -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER=5 + * @note When after pad X, starting point to read backward from must be passed using the + * -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253 + * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be + * set to 0 + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be + * passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must + * be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. + * -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If the starting point to read backward from is less than the output's last element accessed + * in the X, the following compile flags must be passed at compile time to avoid negative offsets: + * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation + * attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: + * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in + * bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in + * bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in + * bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data + * types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in + * bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed + * per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in + * bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed + * per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in + * bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed + * per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination + * image + */ +__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)) +{ + // Get current thread position + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Define conditions based on the thread X position w.r.t. 
pad left and right + const int x_out_first = x * VEC_SIZE; + const int x_out_last = x_out_first + VEC_SIZE; + const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE); + const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE); + const int is_inside_input = + (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE)); + const int is_across_pad_right = + (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE)); + const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE)); + + // Calculate base pointers + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes; + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Calculate input tensor's offset based on the defined conditions + int x_offset = 0; + x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left); + x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input); + x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right); + x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right); + +#if defined(AFTER_PAD_REM) + int neg_offs = x_offset < 0; + x_offset = max(x_offset, 0); +#endif // defined(AFTER_PAD_REM) + + // Load input values from the computed offset + int y_in = y; + int z_in = z; +#if defined(PAD_Y_BEFORE) + y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE); + y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, + y >= (SRC_HEIGHT + PAD_Y_BEFORE)); +#endif // defined(PAD_Y_BEFORE) +#if defined(PAD_Z_BEFORE) + z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE); + z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, + z >= (SRC_DEPTH + PAD_Z_BEFORE)); +#endif // defined(PAD_Y_BEFORE) + + src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z; + +#if SRC_WIDTH == 1 + VSTORE(VEC_SIZE) + ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr); +#else // SRC_WIDTH == 1 + + VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + + // Choose rearrangement policy based on the defined conditions + src_vals = + select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), + SCALAR_COND(is_across_pad_left)); + src_vals = + select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), + SCALAR_COND(is_across_pad_right)); + src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE), + SCALAR_COND((is_before_pad_left || is_after_pad_right))); +#if defined(AFTER_PAD_REM) + src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs)); +#endif // defined(AFTER_PAD_REM) + + // Store + VSTORE(VEC_SIZE) + (src_vals, 0, (__global DATA_TYPE *)dst.ptr); +#endif // SRC_WIDTH == 1 +} +#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && + // defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && + // defined(AFTER_PAD_FACT_X) +#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && + // defined(SRC_WIDTH) diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h new file mode 100644 index 000000000..cfc811cce --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2021 Samsung Electronics 
Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_REPEAT_H +#define ARM_COMPUTE_REPEAT_H + +#include "helpers.h" + +/** Macros that help in loop unrolling */ +// Repeat macros with 3 param, excluding the implicit ID param +#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) +#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ + P_X##_DEF(1, P_A, P_B, P_C); \ + REPEAT_3_1(P_X, P_A, P_B, P_C) +#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ + P_X##_DEF(2, P_A, P_B, P_C); \ + REPEAT_3_2(P_X, P_A, P_B, P_C) +#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ + P_X##_DEF(3, P_A, P_B, P_C); \ + REPEAT_3_3(P_X, P_A, P_B, P_C) +#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ + P_X##_DEF(4, P_A, P_B, P_C); \ + REPEAT_3_4(P_X, P_A, P_B, P_C) +#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ + P_X##_DEF(5, P_A, P_B, P_C); \ + REPEAT_3_5(P_X, P_A, P_B, P_C) +#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ + P_X##_DEF(6, P_A, P_B, P_C); \ + REPEAT_3_6(P_X, P_A, P_B, P_C) +#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ + P_X##_DEF(7, P_A, P_B, P_C); \ + REPEAT_3_7(P_X, P_A, P_B, P_C) +#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ + P_X##_DEF(8, P_A, P_B, P_C); \ + REPEAT_3_8(P_X, P_A, P_B, P_C) +#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ + P_X##_DEF(9, P_A, P_B, P_C); \ + REPEAT_3_9(P_X, P_A, P_B, P_C) +#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ + P_X##_DEF(A, P_A, P_B, P_C); \ + REPEAT_3_10(P_X, P_A, P_B, P_C) +#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ + P_X##_DEF(B, P_A, P_B, P_C); \ + REPEAT_3_11(P_X, P_A, P_B, P_C) +#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ + P_X##_DEF(C, P_A, P_B, P_C); \ + REPEAT_3_12(P_X, P_A, P_B, P_C) +#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ + P_X##_DEF(D, P_A, P_B, P_C); \ + REPEAT_3_13(P_X, P_A, P_B, P_C) +#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ + P_X##_DEF(E, P_A, P_B, P_C); \ + REPEAT_3_14(P_X, 
P_A, P_B, P_C) +#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ + P_X##_DEF(F, P_A, P_B, P_C); \ + REPEAT_3_15(P_X, P_A, P_B, P_C) + +#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \ + REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) // One level of indirection to ensure order of expansion + // does not affect preprocessing P_NUM +#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) + +// Repeat macros with 4 param, excluding the implicit ID param +#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) +#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(1, P_A, P_B, P_C, P_D); \ + REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(2, P_A, P_B, P_C, P_D); \ + REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(3, P_A, P_B, P_C, P_D); \ + REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(4, P_A, P_B, P_C, P_D); \ + REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(5, P_A, P_B, P_C, P_D); \ + REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(6, P_A, P_B, P_C, P_D); \ + REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(7, P_A, P_B, P_C, P_D); \ + REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(8, P_A, P_B, P_C, P_D); \ + REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(9, P_A, P_B, P_C, P_D); \ + REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(A, P_A, P_B, P_C, P_D); \ + REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(B, P_A, P_B, P_C, P_D); \ + REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(C, P_A, P_B, P_C, P_D); \ + REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(D, P_A, P_B, P_C, P_D); \ + REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(E, P_A, P_B, P_C, P_D); \ + REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) +#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ + P_X##_DEF(F, P_A, P_B, P_C, P_D); \ + REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) + +#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \ + REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) // One level of indirection to ensure order of + // expansion does not affect preprocessing P_NUM +#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) + +// Macro for initializing N variables. Generates N statements that defines VAR##N = +// RHS_ACCESSOR_DEF(...) +#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL +#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) + +// Macro for initializing N variables by converting the data type. Generates N statements that +// defines VAR##N = RHS_ACCESSOR_DEF(...) +#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \ + TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) +#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \ + REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) + +// Macro for adding a constant to N variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) 
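+// For example (illustrative only, with an arbitrary variable name "acc"):
+//   REPEAT_ADD_CONST_TO_VAR(3, int4, acc, 8)
+// unrolls via REPEAT_3_3 into three statements, one per ID from N-1 down to 0:
+//   acc2 += (int4)8; acc1 += (int4)8; acc0 += (int4)8;
+// The remaining REPEAT_* helpers in this header unroll in the same way.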
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL +#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) + +// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables +// (VAR_A). Generates N statements that defines VAR_A##N =RHS_ACCESSOR_DEF(...) +#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL +#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) \ + REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) + +// Macro for adding a vector to N-variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) +#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC +#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) + +// Macro for adding a two N-variables. Generates N statements that defines VAR##N +// =RHS_ACCESSOR_DEF(...) +#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID +#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) + +// Macro for performing Max between a constant and N variables. Generates N statements that defines +// VAR##N =RHS_ACCESSOR_DEF(...) +#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) +#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) + +// Macro for performing Min between a constant and N variables. Generates N statements that defines +// VAR##N =RHS_ACCESSOR_DEF(...) +#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) +#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) + +// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N +// statements that defines VAR##N =RHS_ACCESSOR_DEF(...) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) + +// Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N +// statements that defines VAR##N =RHS_ACCESSOR_DEF(...) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) + +// Macro for performing per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables. 
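+// Both candidate rescalings below are computed and the final value is chosen per element with
+// select(): channels whose RES_SHIFT is negative take the GREATER_THAN_ONE variant, all other
+// channels the LESS_THAN_ONE variant.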
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ + ({ \ + VEC_DATA_TYPE(int, N0) \ + VAR##ID_shift_lt0 = \ + ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ + VEC_DATA_TYPE(int, N0) \ + VAR##ID_shift_gt0 = \ + ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ + VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ + }) +#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \ + REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) + +#endif // ARM_COMPUTE_REPEAT_H diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl new file mode 100644 index 000000000..8da8bfc8e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Perform tensor reshape + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. + * -DDATA_TYPE=short + * + * @param[in] input_ptr Pointer to the first source tensor. 
Supported + * data types: All + * @param[in] input_stride_x Stride of the first source tensor in X dimension + * (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension + * (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension + * (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first + * source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported + * data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension + * (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X + * processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension + * (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y + * processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension + * (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z + * processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the + * destination tensor + * @param[in] input_shape Input spatial shape + * @param[in] output_shape Output spatial shape + */ +__kernel void reshape_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output), + int2 input_shape, int2 output_shape) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); + + int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); + + // Linearize index + int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y; + + // Translate to output + int3 out_id; + out_id.x = linear_idx % output_shape.x; + out_id.y = (linear_idx / output_shape.x) % output_shape.y; + out_id.z = linear_idx / (output_shape.x * output_shape.y); + + // Store result + *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = + *((__global DATA_TYPE *)in.ptr); +} diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp index 45307fad7..987409739 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp @@ -39,16 +39,18 @@ */ #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" 
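+// These helpers (AccessWindowStatic, CLValidate, window and auto-configuration utilities) are now
+// taken from the library-internal src/core tree instead of the public arm_compute/core headers.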
+#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp index ffa2c5a67..a5daa2410 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp @@ -43,6 +43,8 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/AccessWindowStatic.h" #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp index 3f2ae357d..dc06bfbb3 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp @@ -41,13 +41,16 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include <cstddef> diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp index e4c617c8d..4206f1fd4 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp index 8b5885225..62da2376e 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp @@ -45,6 +45,10 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/core/UtilsEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp index f0a761b97..03ca6ddcb 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp @@ -43,6 +43,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" using namespace arm_compute; @@ -111,7 +112,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso _hits = hits; // Make _lookup_indices tensor - 
_lookup_indices = support::cpp14::make_unique<CLTensor>(); + _lookup_indices = std::make_unique<CLTensor>(); _lookup_indices->allocator()->init( TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32)); _lookup_indices->allocator()->allocate(); diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp index dab6480b2..945af3c51 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp @@ -42,12 +42,16 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include "support/ToolchainSupport.h" diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp new file mode 100644 index 000000000..a00fc5e2e --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2018-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/CL/kernels/CLMemsetKernel.h" + +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +CLMemsetKernel::CLMemsetKernel() : ICLKernel(), _tensor(nullptr), _full_window() {} + +void CLMemsetKernel::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window) +{ + configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window); +} + +void CLMemsetKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, + const PixelValue &constant_value, Window *window) +{ + ARM_COMPUTE_UNUSED(compile_context); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window)); + + _tensor = tensor; + + const DataType data_type = tensor->info()->data_type(); + const int vec_size_x = 16 / tensor->info()->element_size(); + + // Create and update the window (if needed) + _full_window = calculate_max_window(*tensor->info()); + Window win = _full_window; + if (window != nullptr) + { + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window); + win = *window; + } + + const int output_width_x = win.num_iterations(0); + const bool multi_access_x = output_width_x >= vec_size_x; + const bool remainder_x = output_width_x % vec_size_x > 0; + + if (multi_access_x) + { + win.set( + Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type)); + build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max<int>(output_width_x - vec_size_x, 0))); + + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("memset", build_opts.options())); +} + +Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, + Window *window) +{ + ARM_COMPUTE_UNUSED(tensor); + ARM_COMPUTE_UNUSED(constant_value); + if (window != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); + } + return Status{}; +} + +void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Collapse all the batches on the third + Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _tensor, slice); + enqueue(queue, *this, slice, lws_hint()); + } while (collapsed.slide_window_slice_3D(slice)); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp index 1d4b141a7..da7437e97 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp +++ 
b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp @@ -40,15 +40,19 @@ #include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp index ee633d437..cd5e571e9 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp index 0b8e7cc41..4c4cbe710 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp @@ -42,6 +42,10 @@ #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include <string> namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp new file mode 100644 index 000000000..b6efeac35 --- /dev/null +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2020 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" + +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(constant_value); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions()); + if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC) + { + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3); + + const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); + for (size_t i = 0; i < padding.size(); ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect)); + ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect)); + } + } + + if (output->total_size() > 0) + { + TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); + } + + return Status{}; +} + +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode, + unsigned int &num_elems_processed_per_iteration) +{ + ARM_COMPUTE_UNUSED(constant_value, mode); + + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); + auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape)); + + num_elems_processed_per_iteration = + std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type()))); + if (input->dimension(0) < num_elems_processed_per_iteration) + { + num_elems_processed_per_iteration = + 1 << static_cast<unsigned int>(std::log2(input->dimension(0))); + } + + // Configure kernel 
window + Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); + + const int input_start_x = + mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0; + const int input_start_y = + (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0; + + AccessWindowRectangle input_access(input, input_start_x, input_start_y, + num_elems_processed_per_iteration, 1); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); + + const bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLPadLayerKernelEx::CLPadLayerKernelEx() + : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false) +{ +} + +void CLPadLayerKernelEx::configure(const ICLTensor *input, ICLTensor *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, + mode); +} + +void CLPadLayerKernelEx::configure(const CLCompileContext &compile_context, const ICLTensor *input, + ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_UNUSED(compile_context); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), output->info(), padding, constant_value, mode)); + + _input = input; + _output = output; + _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3); + + // Configure window + unsigned int vec_size; + auto win_config = validate_and_configure_window(input->info(), output->info(), padding, + constant_value, mode, vec_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set build options + std::string kernel_name = "pad_layer_"; + + const DataType &data_type = input->info()->data_type(); + const unsigned int input_width = input->info()->dimension(0); + const unsigned int input_height = input->info()->dimension(1); + const unsigned int input_depth = input->info()->dimension(2); + const unsigned int pad_x_before = padding.at(0).first; + const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0; + const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0; + const unsigned int pad_right_start = input_width + pad_x_before; + + _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0; + _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? 
-padding.at(1).first : 0; + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before)); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); + if (padding.size() > 1) + { + build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); + + if (padding.size() > 2) + { + build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before)); + build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth)); + } + } + + switch (mode) + { + case PaddingMode::CONSTANT: + { + kernel_name += "constant"; + + build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type)); + build_opts.add_option_if(pad_x_before >= vec_size, + "-DNUM_THREADS_TO_SKIP_X=" + + support::cpp11::to_string(pad_x_before / vec_size)); + + if (_4d_enabled) + { + build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first)); + build_opts.add_option("-DSRC_BATCH=" + + support::cpp11::to_string(input->info()->dimension(3))); + } + + break; + } + case PaddingMode::SYMMETRIC: + case PaddingMode::REFLECT: + { + kernel_name += "symmetric_reflect"; + + const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT); + + const unsigned int pad_x_before_remainder = pad_x_before % vec_size; + const unsigned int pad_x_after_remainder = pad_right_start % vec_size; + const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect; + const unsigned int output_last_x = + ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size); + + build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect)); + build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + + support::cpp11::to_string(pad_x_before_remainder)); + build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + + support::cpp11::to_string(pad_x_after_remainder)); + build_opts.add_option( + "-DPAD_X_BEFORE_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size)); + build_opts.add_option( + "-DPAD_X_AFTER_REMAINDER_REFL=" + + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size)); + build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x)); + build_opts.add_option_if(after_pad_fact_x < output_last_x, + "-DAFTER_PAD_REM=" + + support::cpp11::to_string(after_pad_fact_x % vec_size)); + + break; + } + default: + ARM_COMPUTE_ERROR("Padding mode not supported."); + } + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options())); +} + +Status CLPadLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + unsigned int vec_size; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), + output->clone().get(), padding, + constant_value, mode, vec_size) + .first); + + return Status{}; +} + +void CLPadLayerKernelEx::run(const Window &window, cl::CommandQueue 
&queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window win_in = window; + win_in.adjust(Window::DimX, _input_start_x, true); + win_in.adjust(Window::DimY, _input_start_y, true); + + Window slice_out = window.first_slice_window_3D(); + Window slice_in = win_in.first_slice_window_3D(); + unsigned int batch = 0; + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice_in); + add_3D_tensor_argument(idx, _output, slice_out); + if (_4d_enabled) + { + add_argument<unsigned int>(idx, batch++); + } + + enqueue(queue, *this, slice_out, lws_hint()); + } while (window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in)); +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp index b417a7103..9aa815f55 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp @@ -40,15 +40,19 @@ #include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h" -#include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" -#include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp index 3906009c2..70374ba61 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp @@ -43,6 +43,9 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" + +#include "src/core/helpers/WindowHelpers.h" + #include "support/StringSupport.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp index 4a6374444..c9d6dc31c 100644 --- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp +++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp @@ -40,7 +40,7 @@ #include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h" -#include "arm_compute/core/AccessWindowStatic.h" +#include "src/core/AccessWindowStatic.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibraryEx.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -48,6 +48,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include "support/StringSupport.h" #include <climits> diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp 
b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp index c88bef6d7..1d4d33ac2 100644 --- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp +++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp @@ -42,7 +42,7 @@ #include <algorithm> #include "arm_compute/core/Types.h" -#include "arm_compute/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEAsymm.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp index a8464afce..0551fc7db 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp @@ -43,10 +43,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <algorithm> #include <arm_neon.h> @@ -163,7 +163,7 @@ void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out, std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func( const ITensor *input1, const ITensor *input2, ITensor *output, - std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function) + std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *> map_function) { std::string function_to_call("op_"); function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; @@ -185,9 +185,9 @@ template <BinaryLogicalOperation op> std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output) { - static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = { - {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, - {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; + static std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *> + map_function = {{"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}, + {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}}; return configure_func(input1, input2, output, map_function); } @@ -196,7 +196,7 @@ void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const const ITensor *input2, ITensor *output) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info())); - configure_common(input1, input2, output); + configure_common(input1->info(), input2->info(), output->info()); switch (op) { case BinaryLogicalOperation::AND: @@ -251,5 +251,4 @@ Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op, ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); return Status{}; } - } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp index f935596e6..87e716b4f 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp +++ 
b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp @@ -39,16 +39,19 @@ */ #include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" +#include "src/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/SaturateCast.h" +#include "support/SaturateCast.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + +#include "src/core/NEON/INEKernel.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp index e3a77c6b1..3ad9ee945 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp @@ -47,6 +47,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + using namespace arm_compute; NEEmbeddingLookupKernel::NEEmbeddingLookupKernel() diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 000000000..375fa28e5 --- /dev/null +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2017-2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> +#include <cstddef> +#include <cstdint> +#include <mutex> + +using namespace arm_compute; + +namespace +{ +inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0)); + + return Status{}; +} + +inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, + ITensorInfo *biases) +{ + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + bool window_changed = update_window_and_padding( + win, AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration), + AccessWindowStatic(biases, 0, 0, + ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), + biases->tensor_shape().y())); + + AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration); + + // Set the valid region for the accum tensor + Coordinates coord; + coord.set_num_dimensions(accum->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape())); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); + + _biases = biases; + _accum = accum; + + // Configure kernel window + auto win_config = validate_and_configure_window(accum->info(), biases->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, + const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(accum->clone().get(), biases->clone().get()).first); + + return Status{}; +} + +std::mutex m; +void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info) +{ + std::lock_guard<std::mutex> lock_guard(m); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window win_biases; + win_biases.set(Window::DimX, + Window::Dimension(window.x().start(), window.x().end(), window.x().step())); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in0_out(_accum, window); + Iterator in1(_biases, win_biases); + + switch (_accum->info()->data_type()) + { + case DataType::F32: + { + execute_window_loop( + window, + [&](const Coordinates &) { + const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr())); + const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr())); + const float32x4x4_t res = { + {vaddq_f32(accum.val[0], biases.val[0]), vaddq_f32(accum.val[1], biases.val[1]), + vaddq_f32(accum.val[2], biases.val[2]), vaddq_f32(accum.val[3], biases.val[3])}}; + + vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res); + }, + in0_out, in1); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + execute_window_loop( + window, + [&](const Coordinates &) { + const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr())); + const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr())); + const float16x8x2_t res = { + {vaddq_f16(accum.val[0], biases.val[0]), vaddq_f16(accum.val[1], biases.val[1])}}; + + vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res); + }, + in0_out, in1); + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } +} diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp index c9f0799d4..d4144e6b9 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp @@ -40,7 +40,7 @@ #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -50,6 +50,9 @@ #include 
"arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { namespace diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp index 52b40e767..f178865b7 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp @@ -47,6 +47,9 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <unordered_map> using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp index 4dc0f5535..7804f9c6a 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp @@ -40,17 +40,22 @@ #include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/NEMath.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <arm_neon.h> namespace arm_compute diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp index ad4728175..8ad998313 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp @@ -42,13 +42,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp index 0daff5c6a..e56fbf7f3 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp @@ -38,7 +38,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -47,6 +47,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" + namespace arm_compute { namespace diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp index 2306228d5..420e5063c 100644 --- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp +++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp @@ -42,13 +42,16 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/NEON/NEAsymm.h" -#include "arm_compute/core/NEON/wrapper/wrapper.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/CPP/Validate.h" +#include "src/core/CPP/Validate.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp index b02a48ef2..6b9b0d4b4 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp @@ -45,7 +45,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/Utils.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/runtime/Utils.h" namespace arm_compute { @@ -66,7 +68,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const unsigned int num_of_stages = - calculate_number_of_stages_only_x_axis(input->dimension(0), axis); + utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis); DataType output_data_type = DataType::S32; TensorInfo not_reshaped_output; @@ -132,7 +134,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate( input, &sums_vector[last_stage - 1], ¬_reshaped_output, axis, op)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(¬_reshaped_output, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(¬_reshaped_output, output)); return Status{}; } @@ -140,7 +142,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * const ReductionOperation &op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); + _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis); _reduction_axis = axis; const TensorShape output_shape = 
arm_compute::misc::shape_calculator::compute_reduced_shape( @@ -204,7 +206,8 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor * &_not_reshaped_output, axis, op); _results_vector[last_stage - 1].allocator()->allocate(); } - _reshape_kernel.configure(&_not_reshaped_output, output); + _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output, + output); _not_reshaped_output.allocator()->allocate(); } @@ -216,6 +219,6 @@ void CLArgMinMaxLayerEx::run() { CLScheduler::get().enqueue(_reduction_kernels_vector[i], false); } - CLScheduler::get().enqueue(_reshape_kernel, false); + _reshape_kernel.run(); } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp index e5122ab8f..31c96b080 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp @@ -42,13 +42,14 @@ #include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" using namespace arm_compute; void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, BinaryLogicalOperation op) { - auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>(); + auto k = std::make_unique<CLBinaryLogicalOpKernel>(); k->configure(input1, input2, output, op); _kernel = std::move(k); @@ -57,7 +58,7 @@ void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTenso ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; if (broadcasted_info->info()->dimension(0) == 1) { - _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + _border_handler->configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); } } } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp index c7d0ac8e2..96f9c17a9 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp @@ -46,7 +46,7 @@ using namespace arm_compute; void CLCastBool::configure(ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>(); + auto k = std::make_unique<CLCastBoolKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 6359b4bcb..464f60dee 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -45,6 +45,8 @@ #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" + #include <memory> #include <tuple> diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp index ae9d8afc6..003ec8042 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp @@ -39,7 +39,6 @@ */ #include 
"arm_compute/runtime/CL/functions/CLEmbeddingLookup.h" - #include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h" using namespace arm_compute; @@ -47,7 +46,7 @@ using namespace arm_compute; void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups) { - auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>(); + auto k = std::make_unique<CLEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index 79d0929a9..af936e873 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -45,7 +45,6 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" #include <algorithm> @@ -68,7 +67,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) { - auto k = support::cpp14::make_unique<CLTransposeKernel>(); + auto k = std::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index 13d3acbac..c6a88d340 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -42,11 +42,11 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + +#include "support/Cast.h" #include <algorithm> @@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) { - auto k = support::cpp14::make_unique<CLTransposeKernel>(); + auto k = std::make_unique<CLTransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp index ac6982e6f..cda784541 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp @@ -19,6 +19,7 @@ #include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h> #include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h> +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 000000000..cd7409417 --- /dev/null +++ 
b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Copyright (c) 2017-2020 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "support/StringSupport.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/WindowHelpers.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1); + + return Status{}; +} + +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target, + unsigned int &num_elems_processed_per_iteration) +{ + // Select the vector size to use (8 for Bifrost; 16 for Midgard). + bool is_gpu_bifrost = + gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, GPUTarget::G51, + GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT); + num_elems_processed_per_iteration = is_gpu_bifrost ? 
8 : 16; + + // Configure kernel window + Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access( + biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), + biases->dimension(1)); + AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration); + + bool window_changed = update_window_and_padding(win, biases_access, accum_access); + + Status err = (window_changed) + ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) +{ + configure(CLKernelLibrary::get().get_compile_context(), accum, biases); +} + +void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context, + ICLTensor *accum, const ICLTensor *biases) +{ + ARM_COMPUTE_UNUSED(compile_context); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info())); + + _biases = biases; + _accum = accum; + + // Get the target gpu + GPUTarget gpu_target = get_target(); + unsigned int vector_size = 0; + + // Configure kernel window + auto win_config = + validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type())); + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("gemm_accumulate_biases", build_opts.options())); +} + +Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, + const ITensorInfo *biases, GPUTarget gpu_target) +{ + unsigned int num_elems_processed_per_iteration = 0; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), + biases->clone().get(), gpu_target, + num_elems_processed_per_iteration) + .first); + + return Status{}; +} + +void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window accum_slice = window.first_slice_window_2D(); + + Window biases_slice(accum_slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + add_2D_tensor_argument(idx, _accum, accum_slice); + add_1D_tensor_argument(idx, _biases, biases_slice); + + enqueue(queue, *this, accum_slice, lws_hint()); + } while (window.slide_window_slice_2D(accum_slice)); +} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp index e0b833b04..f380e3e2c 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp @@ -41,6 +41,8 @@ #include "arm_compute/runtime/CL/functions/CLGatherEx.h" #include 
"arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/kernels/CLGatherKernel.h" + #include "arm_compute/core/CL/kernels/CLGatherExKernel.h" using namespace arm_compute; @@ -48,7 +50,7 @@ using namespace arm_compute; void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis) { - auto k = support::cpp14::make_unique<CLGatherExKernel>(); + auto k = std::make_unique<CLGatherExKernel>(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp index 65b89a389..9896abd4b 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp @@ -47,7 +47,7 @@ using namespace arm_compute; void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *input, ICLTensor *output, ICLTensor *hits) { - auto k = support::cpp14::make_unique<CLHashtableLookupKernel>(); + auto k = std::make_unique<CLHashtableLookupKernel>(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp index 5a7e40839..ca45a57f8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp @@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {} void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output, ICLTensor *gamma, ICLTensor *beta, float epsilon) { - auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>(); + auto k = std::make_unique<CLInstanceNormalizationLayerKernelEx>(); k->configure(input, output, gamma, beta, epsilon); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp index 28e5bc0da..2bdc451b3 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp @@ -46,7 +46,7 @@ using namespace arm_compute; void CLNeg::configure(ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>(); + auto k = std::make_unique<CLNegKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp index aa9f32ec6..759a19ff3 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp @@ -41,7 +41,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/kernels/CLOneHotKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "support/MemorySupport.h" + namespace arm_compute { CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {} diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp new file mode 100644 index 000000000..4d940e966 --- /dev/null +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2021 
Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h" +#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" + +namespace arm_compute +{ +CLPadLayerEx::CLPadLayerEx() + : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()), + _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false) +{ +} + +void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, + mode); +} + +void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor *input, + ICLTensor *output, const PaddingList &padding, + PixelValue constant_value, PaddingMode mode) +{ + ARM_COMPUTE_ERROR_THROW_ON( + validate(input->info(), output->info(), padding, constant_value, mode)); + + _perform_pad = std::any_of(padding.begin(), padding.end(), + [](PaddingInfo info) { return info.first > 0 || info.second > 0; }); + + if (_perform_pad) + { + _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode); + } + else + { + Window copy_window = Window(); + copy_window.use_tensor_dimensions(output->info()->tensor_shape()); + // Copy the input to the whole output if no padding is applied + _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window); + } +} +Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, + const PaddingList &padding, PixelValue constant_value, + PaddingMode mode) +{ + bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { + return info.first > 0 || info.second > 0; + }); + + if (perform_pad) + { + ARM_COMPUTE_RETURN_ON_ERROR( + CLPadLayerKernelEx::validate(input, output, padding, constant_value, mode)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output)); + } + return Status{}; +} +void CLPadLayerEx::run() +{ + if (_perform_pad) + { + CLScheduler::get().enqueue(*_pad_kernel); + } + else + { + CLScheduler::get().enqueue(*_copy_kernel); + } +} +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp index c246041bb..6740835a8 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp @@ -61,7 +61,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo * ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1); // Create temporary tensor infos - auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors); + auto interm_tensors = std::make_unique<TensorInfo[]>(num_of_interm_tensors); // Create intermediate tensor info TensorShape shape{input->tensor_shape()}; @@ -124,8 +124,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output, throw std::runtime_error("CLReduceOperation: there is no axis to reduce"); } - _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors); - _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels); + _interm_tensors = std::make_unique<CLTensor[]>(num_of_interm_tensors); + _reduce_kernels = std::make_unique<CLReduceOperationKernel[]>(num_of_kernels); // Set a vector that is ordered ICLTensors sequentially. 
std::vector<ICLTensor *> tensors; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp index 12c0aa829..73f5f6eb1 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp @@ -47,6 +47,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" #include <cassert> using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index 0754fd813..f3f093c18 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -79,7 +79,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC { case DeconvolutionMethod::DIRECT: { - auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>(); + auto f = std::make_unique<CLDirectTransposeConvLayer>(); f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info); _function = std::move(f); @@ -87,7 +87,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC } case DeconvolutionMethod::GEMM: { - auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); + auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager); f->configure(compile_context, input, weights, bias, output, deconv_info); _function = std::move(f); break; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp index 2fc94b267..e6b7329d1 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp @@ -38,11 +38,10 @@ * SOFTWARE. 
*/ -#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" #include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h> +#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h" #include "arm_compute/core/ITensor.h" -#include "support/MemorySupport.h" #include <utility> @@ -53,7 +52,7 @@ template <BinaryLogicalOperation COP> void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output) { - auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + auto k = std::make_unique<NEBinaryLogicalOperationKernel>(); k->configure(COP, input1, input2, output); _kernel = std::move(k); } @@ -69,7 +68,7 @@ Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1, void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output, BinaryLogicalOperation op) { - auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>(); + auto k = std::make_unique<NEBinaryLogicalOperationKernel>(); k->configure(op, input1, input2, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp index 6ad3e1b12..f6eec2603 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp @@ -40,13 +40,12 @@ #include "arm_compute/runtime/NEON/functions/NECastBool.h" #include "arm_compute/core/NEON/kernels/NECastBoolKernel.h" -#include "support/MemorySupport.h" using namespace arm_compute; void NECastBool::configure(const ITensor *input, ITensor *output) { - auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>(); + auto k = std::make_unique<NECastBoolKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp index e0ab3e025..99fc5c579 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp @@ -41,13 +41,12 @@ #include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h" #include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h" -#include "support/MemorySupport.h" using namespace arm_compute; void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups) { - auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>(); + auto k = std::make_unique<NEEmbeddingLookupKernel>(); k->configure(input, output, lookups); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index e212a03c7..fbd88fff0 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) { - auto k = support::cpp14::make_unique<NETransposeKernel>(); + auto k = std::make_unique<NETransposeKernel>(); k->configure(input, output); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp 
b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index a639f2979..758f7dc59 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -50,7 +50,8 @@ #include <algorithm> #include <cmath> -using namespace arm_compute; +namespace arm_compute +{ using namespace arm_compute::misc::shape_calculator; namespace @@ -164,9 +165,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei const ITensor *biases, ITensor *output, FullyConnectedLayerInfo fc_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate( input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info)); @@ -348,7 +348,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor (input->dimension(0) * input->dimension(1) * input->dimension(2)))); // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input)); input_to_use = &flatten_input; } else @@ -374,9 +374,13 @@ void NEFullyConnectedLayerEx::run() if (!_is_prepared) { if (!_are_weights_reshaped) + { _reshape_weights_output.allocator()->allocate(); + } if (!_are_weights_converted) + { _converted_weights_output.allocator()->allocate(); + } _is_prepared = true; } @@ -407,7 +411,7 @@ void NEFullyConnectedLayerEx::run() // Linearize input if it comes from a convolutional layer if (_is_fc_after_conv) { - NEScheduler::get().schedule(&_flatten_kernel, Window::DimY); + _flatten_kernel.run(); } // Run matrix multiply @@ -490,3 +494,4 @@ void NEFullyConnectedLayerEx::prepare() } #endif } +} // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index 234c783f9..2199839fb 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -19,6 +19,8 @@ #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h> #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h> +#include "src/core/helpers/AutoConfiguration.h" +#include <cassert> using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp index 433c35d58..e5607ab9a 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp @@ -41,7 +41,6 @@ #include "arm_compute/runtime/NEON/functions/NEGatherEx.h" #include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h" -#include "support/MemorySupport.h" #include <utility> @@ -49,7 +48,7 @@ namespace arm_compute { void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) { - auto k = support::cpp14::make_unique<NEGatherKernelEx>(); + auto k = std::make_unique<NEGatherKernelEx>(); k->configure(input, indices, output, axis); _kernel = std::move(k); } diff --git 
a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp index 52d58accf..7cc6c89e7 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp @@ -41,14 +41,13 @@ #include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h" #include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h" -#include "support/MemorySupport.h" using namespace arm_compute; void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input, ITensor *output, ITensor *hits) { - auto k = support::cpp14::make_unique<NEHashtableLookupKernel>(); + auto k = std::make_unique<NEHashtableLookupKernel>(); k->configure(lookups, keys, input, output, hits); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp index 275c55024..e0620bad2 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp @@ -39,14 +39,14 @@ */ #include "arm_compute/runtime/NEON/functions/NEOneHot.h" #include "arm_compute/core/NEON/kernels/NEOneHotKernel.h" -#include "support/MemorySupport.h" + #include <utility> namespace arm_compute { void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value, const ITensor *off_value, ITensor *output, int axis) { - auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>(); + auto k = std::make_unique<NEOneHotKernel>(); k->configure(indices, depth, on_value, off_value, output, axis); _kernel = std::move(k); } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp index c45c335b3..a30c00ea1 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp @@ -40,11 +40,13 @@ #include "arm_compute/runtime/NEON/functions/NEReduceOperation.h" -#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/Tensor.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute; diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp index b21717e86..7a1342644 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp @@ -40,9 +40,13 @@ #include "arm_compute/runtime/NEON/functions/NEReduceSum.h" -#include "arm_compute/core/CPP/Validate.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute; diff --git 
a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp index 50311071b..4675121b2 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp @@ -44,6 +44,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/helpers/AutoConfiguration.h" using namespace arm_compute::misc::shape_calculator; diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index 10f3ecbd3..c5dd63b5b 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -111,6 +111,8 @@ struct SoftmaxParams int32_t zero_point; float scale; float *table; + uint8_t *uint8_table1; + uint8_t *uint8_table2; }; struct PackParams diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index f73c01523..9aae0a957 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -20,6 +20,8 @@ #include "Shape.h" +#include "neon/neon_check.h" + #include <algorithm> #include <cstdint> #include <fixedpoint/fixedpoint.h> @@ -29,6 +31,11 @@ namespace nnfw namespace cker { +template <typename T> struct is_quant8 +{ + static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value; +}; + template <typename T> inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max) { @@ -106,6 +113,34 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); } +#ifdef USE_NEON +inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val, + int32_t quantized_multiplier, int32_t shift) +{ + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); + int32x4x4_t result; + + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[1] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[2] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[3] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup); + + return result; +} +#endif + inline int NodeOffset(int b, int h, int w, int height, int width) { return (b * height + h) * width + w; diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h index a70e39cc9..e10f02ad4 100644 --- a/compute/cker/include/cker/operation/AveragePool.h +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -395,6 +395,129 @@ void AveragePool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, } } +template <> +void AveragePool<int8_t>(const PoolParams ¶ms, const Shape &input_shape, + const int8_t *input_data, const Shape &output_shape, int8_t *output_data) +{ + // Here, and in other pooling ops, in order to maintain locality of reference, + // to minimize some recalculations, and to load into NEON 
vector registers, we + // use an inner loop down the depth. Since depths can be large and hence we + // would need arbitrarily large temporary storage, we divide the work up into + // depth tranches just within the batch loop. + static constexpr int kPoolingAccTrancheSize = 256; + + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + int32_t acc[kPoolingAccTrancheSize]; + for (int batch = 0; batch < batches; ++batch) + { + // We proceed through the depth in tranches (see comment above). The + // depth_base is the depth at the beginning of the tranche. The + // tranche_depth is the depth dimension of the tranche. + for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize) + { + const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + memset(acc, 0, tranche_depth * sizeof(acc[0])); + const int8_t *input_ptr = + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + for (int fy = filter_y_start; fy < filter_y_end; fy++) + { + const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); + for (int fx = filter_x_start; fx < filter_x_end; fx++) + { + const int8_t *input_channel_ptr = input_row_ptr; + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + int16x4_t acc_reg[4]; + int8x16_t input_reg = vld1q_s8(input_channel_ptr); + input_channel_ptr += 16; + acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg))); + acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg))); + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } + for (; channel <= tranche_depth - 8; channel += 8) + { + int16x4_t acc_reg[2]; + int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr)); + input_channel_ptr += 8; + acc_reg[0] = vget_low_s16(input_reg); + acc_reg[1] = vget_high_s16(input_reg); + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } +#endif + for (; channel < tranche_depth; ++channel) + { + acc[channel] += *input_channel_ptr++; + } + input_row_ptr += depth; + } + } 
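The scalar tail that follows rounds each per-channel sum to the nearest average and clamps it to the quantized activation range. A minimal standalone sketch of that step (the helper name and the worked value are illustrative, not part of the diff):

#include <algorithm>
#include <cstdint>

// Round half away from zero when averaging the accumulated sum, then clamp.
inline int8_t RoundedAveragePoolOutput(int32_t acc, int filter_count,
                                       int32_t activation_min, int32_t activation_max)
{
  int32_t a = acc > 0 ? (acc + filter_count / 2) / filter_count
                      : (acc - filter_count / 2) / filter_count;
  a = std::max(a, activation_min);
  a = std::min(a, activation_max);
  return static_cast<int8_t>(a);
}
// Example: a 2x2 window (filter_count = 4) with acc = 7 yields (7 + 2) / 4 = 2.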
+ int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base); + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 8; channel += 8) + { + int16_t buf[8]; + for (int i = 0; i < 8; i++) + { + buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count + : (acc[channel + i] - filter_count / 2) / filter_count; + } + int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf)); + buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max)); + buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min)); + vst1_s8(output_ptr + channel, buf8); + } +#endif + for (; channel < tranche_depth; ++channel) + { + int16_t a = acc[channel] > 0 ? (acc[channel] + filter_count / 2) / filter_count + : (acc[channel] - filter_count / 2) / filter_count; + a = std::max<int16_t>(a, params.quantized_activation_min); + a = std::min<int16_t>(a, params.quantized_activation_max); + output_ptr[channel] = static_cast<int8_t>(a); + } + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h index fe5f87746..c7878496a 100644 --- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -190,34 +190,34 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, } template <BinaryArithmeticOpType op_type, typename T> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data, - output_shape, output_data); + optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: throw std::runtime_error{"Quant8 Asymm NYI"}; - default: assert(false); break; @@ -256,33 +256,32 @@ inline void BinaryArithmeticOp(const 
BinaryArithmeticOpParam ¶ms, const Shap } template <BinaryArithmeticOpType op_type, typename T> -inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, - T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::BroadcastMulDispatchQuant8( - params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: case nnfw::cker::BinaryArithmeticOpType::POW: diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h index 5068eca96..145deda29 100644 --- a/compute/cker/include/cker/operation/BroadcastTo.h +++ b/compute/cker/include/cker/operation/BroadcastTo.h @@ -126,7 +126,7 @@ template <typename Device, typename T> struct BroadcastTo } } }; -} // functor +} // namespace functor template <typename T> inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape, diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h index b20bac3ac..16c937a27 100644 --- a/compute/cker/include/cker/operation/Conv.h +++ b/compute/cker/include/cker/operation/Conv.h @@ -138,6 +138,17 @@ public: } } + void operator()(const ConvParams ¶ms, const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, int8_t *output_data) + { + reference::Conv(params, _per_channel_output_multiplier.data(), _per_channel_output_shift.data(), + input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data); + } + std::vector<int32_t> &per_channel_output_multiplier() { return _per_channel_output_multiplier; } + std::vector<int> &per_channel_output_shift() { return 
_per_channel_output_shift; } + private: bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor, int32_t dilation_height_factor) @@ -180,6 +191,9 @@ private: Shape _im2col_shape; bool _need_im2col; bool _prepared; + // Per channel output multiplier and shift. + std::vector<int32_t> _per_channel_output_multiplier; + std::vector<int> _per_channel_output_shift; }; } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h index 436ddd8c9..06ee780bb 100644 --- a/compute/cker/include/cker/operation/DepthwiseConv.h +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -24,6 +24,7 @@ #include "cker/neon/neon_check.h" #include "cker/operation/optimized/DepthwiseConvFloat.h" #include "cker/operation/optimized/DepthwiseConvUint8.h" +#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h" #include "cker/CpuBackendThreadpool.h" namespace nnfw diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h index 13fccfd15..6721a7508 100644 --- a/compute/cker/include/cker/operation/Einsum.h +++ b/compute/cker/include/cker/operation/Einsum.h @@ -177,7 +177,7 @@ inline Shape copyShape(const Shape &shape) { return Shape::ExtendedShape(shape.DimensionsCount(), shape); } -} +} // namespace class Einsum { diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h index d657acc12..f88c3a5fb 100644 --- a/compute/cker/include/cker/operation/Fill.h +++ b/compute/cker/include/cker/operation/Fill.h @@ -24,7 +24,8 @@ namespace nnfw { namespace cker { -template <typename T> inline void Fill(const T value_data, const Shape &output_shape, T output_data) +template <typename T> +inline void Fill(const T *value_data, const Shape &output_shape, T *output_data) { int output_size = output_shape.FlatSize(); for (int i = 0; i < output_size; i++) diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h index cbebff142..f16e5019d 100644 --- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -772,7 +772,7 @@ PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1) } } // namespace random -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h index 7dc51fe94..6b7049ddf 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOp.h +++ b/compute/cker/include/cker/operation/Helper/RandomOp.h @@ -47,6 +47,6 @@ template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution> }; } // namespace functor -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_OP_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h index 6e9ffbdfd..c99f69709 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -157,7 +157,7 @@ operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *d } // namespace functor -} // end namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // 
__NNFW_CKER_HELPER_RANDOM_OP_CPU_H__ diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h index 5c82d111f..8e5fc22bb 100644 --- a/compute/cker/include/cker/operation/Quantize.h +++ b/compute/cker/include/cker/operation/Quantize.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved.* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +21,10 @@ #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" -#include <stdexcept> +#include <cassert> #include <iostream> +#include <stdexcept> + namespace nnfw { namespace cker @@ -41,6 +44,251 @@ inline void Quantize(const Shape &input_shape, const InputT *input_data, const S output_data[i] = clamped; } } + +inline void Quantize(const int32_t *multiplier, const int32_t *shift, int32_t channel_size, + int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, + int32_t *scratch, int8_t *output) +{ + // Here we're trying to quantize the raw accumulators: + // output_channels + // data data data data data + // rows data data data data data + // data data data data data + // .... + // + // In order to minimize the reload of the multipliers & shifts, once we load + // the multipliers & shifts, we load & quantize the raw accumulators for every + // row. +#ifdef USE_NEON + const int32x4_t output_offset_vec = vdupq_n_s32(output_zp); + const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min); + const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max); + const int32x4_t zeros = vdupq_n_s32(0); +#endif + + assert(total_size % channel_size == 0); + const int32_t rows = total_size / channel_size; + + int c = 0; + +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + for (; c <= channel_size - 8; c += 8) + { + int32x4_t out_shift_1 = vld1q_s32(shift + c); + int32x4_t out_shift_2 = vld1q_s32(shift + c + 4); + int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros); + int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros); + + // Right shift will be performed as left shift with negative values. + int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros); + int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros); + + int32x4_t out_mul_1 = vld1q_s32(multiplier + c); + int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4); + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32x4_t acc_1 = vld1q_s32(scratch + loc); + int32x4_t acc_2 = vld1q_s32(scratch + loc + 4); + + // Saturating Rounding Doubling High Mul. + acc_1 = vshlq_s32(acc_1, left_shift_1); + acc_1 = vqrdmulhq_s32(acc_1, out_mul_1); + acc_2 = vshlq_s32(acc_2, left_shift_2); + acc_2 = vqrdmulhq_s32(acc_2, out_mul_2); + + // Rounding Dividing By POT. + acc_1 = vrshlq_s32(acc_1, right_shift_1); + acc_2 = vrshlq_s32(acc_2, right_shift_2); + + // Add the output offset. + acc_1 = vaddq_s32(acc_1, output_offset_vec); + acc_2 = vaddq_s32(acc_2, output_offset_vec); + + // Apply the activation function. + acc_1 = vmaxq_s32(acc_1, output_activation_min_vec); + acc_1 = vminq_s32(acc_1, output_activation_max_vec); + acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); + acc_2 = vminq_s32(acc_2, output_activation_max_vec); + + // Saturating cast to int8 and store to destination. 
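The NEON pattern used above in Quantize(), shift left, vqrdmulhq_s32 against the Q31 multiplier, then vrshlq_s32 by a negative amount, is the vector form of the scalar fixed-point rescale used throughout cker. A minimal scalar sketch, assuming gemmlowp-style rounding (NEON's rounding shift can differ by one on negative ties); all function names here are illustrative:

#include <algorithm>
#include <cstdint>
#include <limits>

// Saturating (a * b * 2) >> 31 with rounding; vqrdmulhq_s32 is the per-lane NEON equivalent.
inline int32_t SatRoundingDoublingHighMul(int32_t a, int32_t b)
{
  const bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const int32_t result = static_cast<int32_t>((ab + nudge) / (1ll << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : result;
}

// Rounding shift right by a non-negative exponent (ties rounded away from zero).
inline int32_t RoundingDivideByPOTScalar(int32_t x, int exponent)
{
  const int32_t mask = static_cast<int32_t>((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// Per-lane meaning of the vshlq / vqrdmulhq / vrshlq sequence: shift left by
// max(shift, 0), scale by the Q31 multiplier, then rounding-shift right by -min(shift, 0).
inline int32_t MultiplyByQuantizedMultiplierScalar(int32_t x, int32_t multiplier, int shift)
{
  const int left_shift = std::max(shift, 0);
  const int right_shift = std::min(shift, 0);
  return RoundingDivideByPOTScalar(
    SatRoundingDoublingHighMul(x * (1 << left_shift), multiplier), -right_shift);
}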
+ const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); + const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); + const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2); + const int8x8_t res_s8 = vqmovn_s16(res_s16); + vst1_s8(output + loc, res_s8); + } + } + +#endif // USE_NEON + // Handle leftover values, one by one. This is very slow. + for (; c < channel_size; c++) + { + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32_t acc = scratch[loc]; + acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); + acc += output_zp; + acc = std::max(acc, output_min); + acc = std::min(acc, output_max); + output[loc] = static_cast<int8_t>(acc); + } + } +} + +template <typename input_type, typename output_type> +inline void Requantize(const input_type *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data) +{ + assert(!"Requantize: not supported type. It shouldn't reach here."); + UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint, + output_zeropoint, output_data); +} + +template <> +inline void Requantize<uint8_t, int8_t>(const uint8_t *input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, int8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const uint8x16_t input_vec = vld1q_u8(input_data + i); + const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec)); + const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec)); + int32x4x4_t input; + input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half))); + input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half))); + input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half))); + input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half))); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]); + const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]); + const int16x4_t 
narrowed_val_3 = vqmovn_s32(result.val[2]); + const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]); + const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2); + const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4); + const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half); + const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half); + const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half); + vst1q_s8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <> +inline void Requantize<int8_t, uint8_t>(const int8_t *input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, uint8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input_vec = vld1q_s8(input_data + i); + const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec)); + const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec)); + int32x4x4_t input; + input.val[0] = vmovl_s16(vget_low_s16(first_half)); + input.val[1] = vmovl_s16(vget_high_s16(first_half)); + input.val[2] = vmovl_s16(vget_low_s16(second_half)); + input.val[3] = vmovl_s16(vget_high_s16(second_half)); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]); + const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]); + const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]); + const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]); + + const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned); + const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned); 
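Both Requantize specializations implement the same per-element mapping; the NEON bodies only vectorize it. A minimal scalar sketch, assuming the scalar MultiplyByQuantizedMultiplier helper from cker/Utils.h (the function name and example values below are illustrative):

#include <algorithm>
#include <cstdint>
#include <limits>
#include "cker/Utils.h" // for nnfw::cker::MultiplyByQuantizedMultiplier

template <typename OutT>
inline OutT RequantizeOneValue(int32_t input, int32_t input_zeropoint, int32_t output_zeropoint,
                               int32_t effective_scale_multiplier, int32_t effective_scale_shift)
{
  // Re-center on the input zero point, rescale, re-center on the output zero point, then clamp.
  const int32_t centered = input - input_zeropoint;
  const int32_t rescaled = nnfw::cker::MultiplyByQuantizedMultiplier(
                             centered, effective_scale_multiplier, effective_scale_shift) +
                           output_zeropoint;
  const int32_t lo = std::numeric_limits<OutT>::min();
  const int32_t hi = std::numeric_limits<OutT>::max();
  return static_cast<OutT>(std::max(lo, std::min(hi, rescaled)));
}
// Example: uint8 input 200 (zero point 0) with an effective scale of roughly 1.0 and an
// int8 output zero point of -128 requantizes to 200 - 128 = 72.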
+ const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned); + const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned); + const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2); + const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4); + const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half); + const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half); + const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half); + vst1q_u8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<uint8_t>(clamped_output); + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index dbf938147..f54f2e6f1 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -46,6 +46,7 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape input_size *= input_dims[idx]; } reduce_size = input_dims[input_num_dims - 1]; + int offset = 0; for (int idx = 0; idx < input_size; idx++) { int r_idx = 0; @@ -55,14 +56,14 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data); for (; r_idx <= reduce_size - 32; r_idx += 32) { - float32x4_t a10 = vld1q_f32(input_data + r_idx); - float32x4_t a11 = vld1q_f32(input_data + r_idx + 4); - float32x4_t a12 = vld1q_f32(input_data + r_idx + 8); - float32x4_t a13 = vld1q_f32(input_data + r_idx + 12); - float32x4_t a20 = vld1q_f32(input_data + r_idx + 16); - float32x4_t a21 = vld1q_f32(input_data + r_idx + 20); - float32x4_t a22 = vld1q_f32(input_data + r_idx + 24); - float32x4_t a23 = vld1q_f32(input_data + r_idx + 28); + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16); + float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20); + float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24); + float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28); float32x4_t x0 = vaddq_f32(a10, a20); float32x4_t x1 = vaddq_f32(a11, a21); @@ -74,10 +75,23 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape float32x4_t y2 = vaddq_f32(y0, y1); tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2); } + for (; r_idx <= reduce_size - 16; r_idx += 16) + { + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + + float32x4_t x0 = vaddq_f32(a10, a11); + float32x4_t x1 = vaddq_f32(a12, a13); + + float32x4_t y0 = vaddq_f32(x0, x1); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0); + } for (; r_idx <= reduce_size - 8; r_idx += 8) { - float32x4_t a1 = vld1q_f32(input_data + r_idx); - float32x4_t a2 = vld1q_f32(input_data + 
r_idx + 4); + float32x4_t a1 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4); float32x4_t x = vaddq_f32(a1, a2); tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x); } @@ -88,13 +102,14 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape { if (r_idx == 0) { - output_data[idx] = input_data[idx * reduce_size]; + output_data[idx] = input_data[offset]; } else { - output_data[idx] += input_data[idx * reduce_size + r_idx]; + output_data[idx] += input_data[offset + r_idx]; } } + offset += reduce_size; } } #endif // NEON diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h index 8d9a7495f..ae5af7bb3 100644 --- a/compute/cker/include/cker/operation/ResizeBilinear.h +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -264,6 +264,91 @@ void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, batches, input_height, input_width, depth, params.output_height, params.output_width, height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); } + +inline void ComputeInterpolationValues(const int32_t value, const int32_t scale_10, + const bool half_pixel_centers, int32_t input_size, + int32_t *scaled_value, int32_t *lower_bound, + int32_t *upper_bound) +{ + if (half_pixel_centers) + { + *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9); + } + else + { + *scaled_value = value * scale_10; + } + *lower_bound = std::max(*scaled_value / (1 << 10), 0); + *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1); +} + +inline void ResizeBilinear(const ResizeBilinearParams &op_params, + const Shape &unextended_input_shape, const int8_t *input_data, + const Shape &unextended_output_shape, int8_t *output_data) +{ + // If half_pixel_centers is True, align_corners must be False. 
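The int8_t ResizeBilinear body below works entirely in Q10 fixed point: coordinates are scaled by 1 << 10, ComputeInterpolationValues splits them into an integer lower bound and a 10-bit fraction, and the four neighbour products form a Q20 value that is rounded back down by 1 << 20. A one-dimensional sketch of that arithmetic (helper name and values are illustrative only):

#include <cstdint>

// Interpolate between two neighbouring samples with a Q10 fractional weight.
inline int8_t LerpQ10(int8_t lower_sample, int8_t upper_sample, int32_t scaled_pos,
                      int32_t lower_bound)
{
  const int32_t frac = scaled_pos - (lower_bound << 10); // fraction in [0, 1 << 10)
  const int64_t acc = static_cast<int64_t>(lower_sample) * ((1 << 10) - frac) +
                      static_cast<int64_t>(upper_sample) * frac; // Q10 weighted sum
  const int64_t round = acc > 0 ? (1 << 9) : -(1 << 9);
  return static_cast<int8_t>((acc + round) / (1 << 10));
}
// Example: samples 0 and 100 with frac = 512 (i.e. 0.5) give (100 * 512 + 512) / 1024 = 50.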
+ assert(!op_params.half_pixel_centers || !op_params.align_corners); + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_height = input_shape.Dims(1); + const int32_t input_width = input_shape.Dims(2); + const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); + + const int32_t output_height = op_params.output_height; + const int32_t output_width = op_params.output_width; + + int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height; + int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width; + if (op_params.align_corners && output_height > 1) + { + height_scale_10 = + ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1); + } + if (op_params.align_corners && output_width > 1) + { + width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1); + } + + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + int32_t input_y, y0, y1; + ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height, + &input_y, &y0, &y1); + for (int x = 0; x < output_width; ++x) + { + int32_t input_x, x0, x1; + ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width, + &input_x, &x0, &x1); + for (int c = 0; c < depth; ++c) + { + const int64_t output_20_ll = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_lu = + static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) * + (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_rl = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0); + const int64_t output_20_ru = + static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) * + (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0); + const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru; + const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19); + const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20)); + output_data[Offset(output_shape, b, y, x, c)] = interpolation; + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h index 620c1f968..35ecde4ba 100644 --- a/compute/cker/include/cker/operation/SoftMax.h +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -23,6 +23,10 @@ #include "cker/Types.h" #include "cker/eigen/Utils.h" +#if __aarch64__ && __clang__ +#define TFLITE_SOFTMAX_USE_UINT16_LUT +#endif + #include <Eigen/Core> #include <fixedpoint/fixedpoint.h> #include <cmath> @@ -69,7 +73,7 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const } } } -} +} // namespace reference // Performs softmax along the input of size (input_size * batch_size). 
inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta, @@ -127,87 +131,306 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const out_mat.array().rowwise() *= scale; } -inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) -{ - const int32_t input_beta_multiplier = params.input_multiplier; - const int32_t input_beta_left_shift = params.input_left_shift; - const int diff_min = params.diff_min; - // The representation chosen for the input to the exp() function is Q5.26. - // We need to leave extra space since values that we skip might be as large as - // -32 before multiplying by input_beta_multiplier, and therefore as large as - // -16 afterwards. Note that exp(-8) is definitely not insignificant to - // accumulation, but exp(-16) definitely is. - static const int kScaledDiffIntegerBits = 5; - static const int kAccumulationIntegerBits = 12; - using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>; - using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>; - using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; +template <typename T> inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point) +{ + const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled)); + return prob_rnd + zero_point; +} + +#if !__aarch64__ +// With ARM64, rounding is faster than add + truncation. +template <> inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled, int32_t) +{ + return static_cast<int32_t>(prob_rescaled + 0.5f); +} +#endif + +inline void PopulateSoftmaxLookupTable(float *table, float input_scale, float beta) +{ + const float scale = -input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + for (int32_t val = 0; val <= max_uint8; ++val) + { + table[max_uint8 - val] = expf(scale * val); + } +} +template <typename In, typename Out> +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const In *input_data, + const Shape &output_shape, Out *output_data) +{ const int trailing_dim = input_shape.DimensionsCount() - 1; - const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); - for (int i = 0; i < outer_size; ++i) + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + for (int i = 0; i < excluding_last_dim; ++i) { - uint8_t max_in_row = 0; - for (int c = 0; c < depth; ++c) + int32_t max_val = std::numeric_limits<In>::min(); + // Find max quantized value. + for (int j = 0; j < last_dim; ++j) { - max_in_row = std::max(max_in_row, input_data[i * depth + c]); + max_val = std::max(max_val, static_cast<int32_t>(input_data[j])); } - FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); - for (int c = 0; c < depth; ++c) + float sum_exp = 0.0f; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const float *table_offset = ¶ms.table[max_uint8 - max_val]; + // Calculate normalizer sum(exp(x)). 
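The table-driven Softmax above reads params.table so that each entry is exp() applied to the (negative, scaled) distance of an input from the row maximum, which means the table must be filled by PopulateSoftmaxLookupTable before the kernel runs. The caller-side setup is not part of this diff, so the sketch below is an assumption: in the TFLite-derived original, scale is the output tensor's scale and zero_point its zero point, and the helper name is hypothetical.

#include <cstdint>
#include <vector>
#include "cker/Types.h"             // nnfw::cker::SoftmaxParams
#include "cker/operation/SoftMax.h" // nnfw::cker::PopulateSoftmaxLookupTable

// Hypothetical setup helper; buffer ownership and field choices are assumptions.
inline void PrepareQuantizedSoftmaxParams(float input_scale, float beta, float output_scale,
                                          int32_t output_zero_point,
                                          std::vector<float> &table_storage,
                                          nnfw::cker::SoftmaxParams &params)
{
  table_storage.resize(256); // one exp() entry per possible uint8 input value
  params.table = table_storage.data();
  nnfw::cker::PopulateSoftmaxLookupTable(params.table, input_scale, beta);
  params.scale = output_scale;           // used above as 1 / (sum_exp * scale)
  params.zero_point = output_zero_point; // added after rescaling each probability
}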
+ for (int j = 0; j < last_dim; ++j) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); - } + sum_exp += table_offset[input_data[j]]; } - int32_t fixed_sum_of_exps = sum_of_exps.raw(); - int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps)); - // This is the number of bits to the left of the binary point above 1.0. - // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and - // no later adjustment will be needed. - int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; - int32_t shifted_sum_minus_one = - static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32_t>(1) << 31)); + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + // Normalize and quantize probabilities. + for (int j = 0; j < last_dim; ++j) + { + const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp; + const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point); + output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + } + input_data += last_dim; + output_data += last_dim; + } +} + +#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT +// Looks up each element of <indices> in <table>, returns them in a vector. +inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], uint8x16_t indices) +{ + // Look up in 1st quarter of the table: top 2 bits of indices == 00 + uint8x16_t output1 = vqtbl4q_u8(table[0], indices); + // Look up in 2nd quarter of the table: top 2 bits of indices == 01 + uint8x16_t output2 = vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40))); + // Look up in 3rd quarter of the table: top 2 bits of indices == 10 + uint8x16_t output3 = vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80))); + // Look up in 4th quarter of the table: top 2 bits of indices == 11 + uint8x16_t output4 = vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0))); + + // Combine result of the 4 lookups. 
+ return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4)); +} - FixedPoint0 shifted_scale = - one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); +inline void PopulateSoftmaxUInt8LookupTable(uint8_t *uint8_table1, uint8_t *uint8_table2, + float input_scale, float beta) +{ + const float scale = input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const int32_t max_uint16 = std::numeric_limits<uint16_t>::max(); - for (int c = 0; c < depth; ++c) + for (int32_t val = 0; val <= max_uint8; ++val) + { + float input_to_exp = scale * (val - max_uint8); + int32_t temp = static_cast<int>(expf(input_to_exp) * max_uint16 + 0.5); + temp = std::min(max_uint16, temp); + uint8_t part1 = temp >> 8; + uint8_t part2 = temp & 0xff; + uint8_table1[val] = static_cast<uint8_t>(part1); + uint8_table2[val] = static_cast<uint8_t>(part2); + } +} + +inline int FindMaxValue(int size, const uint8_t *input_data, uint8_t offset) +{ + int32_t max_val = std::numeric_limits<uint8_t>::min(); + int j = 0; + + uint8x16_t max_val_dup = vdupq_n_u8(max_val); + uint8x16_t offset_dup = vdupq_n_u8(offset); + for (; j <= size - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data + j); + input_value = veorq_u8(input_value, offset_dup); + max_val_dup = vmaxq_u8(input_value, max_val_dup); + } + max_val = std::max(max_val, static_cast<int32_t>(vmaxvq_u8(max_val_dup))); + + for (; j < size; ++j) + { + max_val = std::max(max_val, static_cast<int32_t>(input_data[j] ^ offset)); + } + return max_val; +} + +#ifdef USE_NEON +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. +inline void StoreValue(int32x4x4_t value_to_store, int8_t *output) +{ + const int16x8_t result_1 = + vcombine_s16(vqmovn_s32(value_to_store.val[1]), vqmovn_s32(value_to_store.val[0])); + const int16x8_t result_2 = + vcombine_s16(vqmovn_s32(value_to_store.val[3]), vqmovn_s32(value_to_store.val[2])); + const int8x16_t result = vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1)); + vst1q_s8(output, result); +} + +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. +inline void StoreValue(int32x4x4_t value_to_store, uint8_t *output) +{ + const uint16x8_t result_1 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0]))); + const uint16x8_t result_2 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2]))); + const uint8x16_t result = vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1)); + vst1q_u8(output, result); +} + +#endif + +template <typename In, typename Out> +inline void SoftmaxInt8LUT(const SoftmaxParams ¶ms, const Shape &input_shape, + const In *input_data, const Shape &output_shape, Out *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + + // Offset is used to interpret the input data "correctly". + // If the input is uint8, the data will be unchanged. + // If the input is int8, since it will be reinterpret as uint8. + // e.g., + // int8 127 will be applied "offset" to become 255 in uint8. 
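A tiny standalone check of the reinterpretation described in the comment above: flipping the sign bit maps int8 values onto uint8 indices while preserving their order, which is what lets one pair of lookup tables serve both input types. The helper name is illustrative.

#include <cassert>
#include <cstdint>

// Map an int8 value to the uint8 index used for the LUTs (the offset = 0x80 case).
inline uint8_t ToLutIndex(int8_t v) { return static_cast<uint8_t>(v) ^ 0x80; }

int main()
{
  assert(ToLutIndex(-128) == 0);  // most negative int8 becomes the smallest index
  assert(ToLutIndex(0) == 128);
  assert(ToLutIndex(127) == 255); // int8 127 becomes 255, as the comment states
  return 0;
}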
+ uint8_t offset = 0; + if (std::is_same<In, int8_t>::value) + { + offset = 0x80; + } + + const uint8_t *input_data_uint = reinterpret_cast<const uint8_t *>(input_data); + + // This code uses ARM64-only instructions. + // TODO(b/143709993): Port to ARMv7 + + // Load the tables into registers. (4*4 128-bit registers) + uint8x16x4_t table1[4]; + table1[0] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 0); + table1[1] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 1); + table1[2] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 2); + table1[3] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 3); + + uint8x16x4_t table2[4]; + table2[0] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 0); + table2[1] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 1); + table2[2] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 2); + table2[3] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 3); + + for (int i = 0; i < excluding_last_dim; ++i) + { + // Find max quantized value. + int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset); + + int32_t sum_exp = 0; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const uint8_t table_offset = max_uint8 - max_val; + + // Calculate normalizer sum(exp(x)). + int sum_j = 0; + uint8x16_t table_offset_dup = vdupq_n_u8(table_offset); + uint8x16_t offset_dup = vdupq_n_u8(offset); + uint32x4_t sum_4 = vdupq_n_u32(0); + const int multiplier_shift = 8; + for (; sum_j <= last_dim - 16; sum_j += 16) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - - FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), - num_bits_over_unit + 31 - 8); - - output_data[i * depth + c] = static_cast<uint8_t>( - std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); - } - else - { - output_data[i * depth + c] = 0; - } + uint8x16_t input_value = vld1q_u8(input_data_uint + sum_j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + sum_4 = vpadalq_u16(sum_4, exp_value1); + sum_4 = vpadalq_u16(sum_4, exp_value2); + } + int temp = vgetq_lane_u32(sum_4, 0) + vgetq_lane_u32(sum_4, 1) + vgetq_lane_u32(sum_4, 2) + + vgetq_lane_u32(sum_4, 3); + sum_exp += temp; + + for (; sum_j < last_dim; ++sum_j) + { + const uint8_t index = (input_data_uint[sum_j] ^ offset) + table_offset; + + uint8_t part1 = params.uint8_table1[index]; + uint8_t part2 = params.uint8_table2[index]; + sum_exp += ((part1 << 8) + part2); + } + + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + + int32_t multiplier, shift; + QuantizeMultiplier(inv_sum_exp, &multiplier, &shift); + + // Normalize and quantize probabilities. 
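For reference, the two uint8 tables used here encode exp() as a 16-bit fixed-point value split across a high and a low byte; the scalar tails reconstruct an entry as (part1 << 8) + part2. A minimal sketch of that reconstruction (the function name is illustrative):

#include <cstdint>

// Rebuild the 16-bit fixed-point exp() value from the paired byte tables.
inline int32_t LookupExpQ16(const uint8_t *uint8_table1, const uint8_t *uint8_table2,
                            uint8_t index)
{
  const int32_t high_byte = uint8_table1[index];
  const int32_t low_byte = uint8_table2[index];
  return (high_byte << 8) + low_byte; // exp(x) scaled to the range [0, 65535]
}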
+ int j = 0; + const int32x4_t output_zp_dup = vdupq_n_s32(params.zero_point); + const int32x4_t max_val_dup = vdupq_n_s32(clamp_max); + const int32x4_t min_val_dup = vdupq_n_s32(clamp_min); + + for (; j <= last_dim - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data_uint + j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + int32x4x4_t output_value; + output_value.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value1))); + output_value.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value1))); + output_value.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value2))); + output_value.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value2))); + + int32x4x4_t temp_val = MultiplyByQuantizedMultiplier4Rows(output_value, multiplier, shift); + + temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup); + temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup); + temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup); + temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup); + + temp_val.val[0] = vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup); + temp_val.val[1] = vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup); + temp_val.val[2] = vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup); + temp_val.val[3] = vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup); + + StoreValue(temp_val, output_data + j); + } + for (; j < last_dim; ++j) + { + const uint8_t index = (input_data_uint[j] ^ offset) + table_offset; + const uint8_t part1 = params.uint8_table1[index]; + const uint8_t part2 = params.uint8_table2[index]; + const int32_t exp_value = (part1 << 8) + part2; + const int32_t output_value = MultiplyByQuantizedMultiplier(exp_value, multiplier, shift); + + output_data[j] = static_cast<Out>( + std::max(std::min(clamp_max, output_value + params.zero_point), clamp_min)); } + input_data_uint += last_dim; + output_data += last_dim; } } +#endif } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h index cdd812a08..dcf649ca1 100644 --- a/compute/cker/include/cker/operation/StatelessRandomUniform.h +++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -72,8 +72,8 @@ void Fill(random::PhiloxRandom random, Tensor *output) Distribution()); } -inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data, - const Shape &seed_shape, const int *seed_data, +inline void StatelessRandomUniform(const Shape &shape_shape, const int32_t *shape_data, + const Shape &seed_shape, const int32_t *seed_data, const Shape &output_shape, float *output_data) { Tensor shape_t; diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h index 62eb432ae..52c826c39 100644 --- a/compute/cker/include/cker/operation/Transpose.h +++ b/compute/cker/include/cker/operation/Transpose.h @@ -288,7 +288,7 @@ size_t Flatten(const Shape 
&input_shape, const Shape &output_shape, const Transp return flat_size; } -} // namespace anonymous (util) +} // namespace // Transpose2D only deals with typical 2D matrix transpose ops. // Perform transpose by transposing 4x4 blocks of the input, proceeding from diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h index 8c1d31b56..1fe3e1517 100644 --- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -35,6 +35,7 @@ namespace cker namespace optimized { +/* Old version: For Sub(float) and Div. */ template <typename ElementwiseF, typename ScalarBroadcastF, typename T> inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, bool switch_inputs, const Shape & /* unswitched_input1_shape */, @@ -122,8 +123,108 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, bool } } -inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +// New version: For Mul, Add and Sub(quant8) +template <typename ElementwiseF, typename ScalarBroadcastF, typename T> +inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &unswitched_params, + const Shape & /* unswitched_input1_shape */, + const T *unswitched_input1_data, + const Shape & /* unswitched_input2_shape */, + const T *unswitched_input2_data, + const Shape & /* output_shape */, T *output_data, + ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f) +{ + BinaryArithmeticOpParam switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const BinaryArithmeticOpParam ¶ms = use_unswitched ? unswitched_params : switched_params; + const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. + T *output_data_ptr = output_data; + const T *input1_data_ptr = input1_data; + const T *input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for + // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) + { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. 
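The new BinaryBroadcastFiveFold takes its innermost work as two functors: elementwise_f(size, params, input1_ptr, input2_ptr, output_ptr) for contiguous runs and scalar_broadcast_f(size, params, input1_value, input2_ptr, output_ptr) for the y4 == 1 case. A minimal illustrative pair, a plain int32 add that ignores the quantization fields, purely to show the expected call shapes (both function names are hypothetical):

#include <cstdint>
#include "cker/Types.h" // nnfw::cker::BinaryArithmeticOpParam

// Contiguous-run functor: matches the elementwise_f(y4, params, p1, p2, out) call.
inline void AddRunExample(int size, const nnfw::cker::BinaryArithmeticOpParam &,
                          const int32_t *input1, const int32_t *input2, int32_t *output)
{
  for (int i = 0; i < size; ++i)
    output[i] = input1[i] + input2[i];
}

// Scalar-broadcast functor: matches scalar_broadcast_f(y3, params, *p1, p2, out).
inline void AddScalarBroadcastExample(int size, const nnfw::cker::BinaryArithmeticOpParam &,
                                      int32_t broadcast_value, const int32_t *input2,
                                      int32_t *output)
{
  for (int i = 0; i < size; ++i)
    output[i] = broadcast_value + input2[i];
}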
+ for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + for (int i3 = 0; i3 < y3; ++i3) + { + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } + else + { + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. + for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_sum(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; @@ -142,9 +243,9 @@ inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t i return clamped_output; } -inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; @@ -218,6 +319,119 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms } } +inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) +{ + int i = 0; +#ifdef USE_NEON + const int8x16_t output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t 
input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_dup); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_dup); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_dup); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_dup); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, -params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = + vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = + vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), vdupq_n_s16(params.output_offset)); + const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, s)); + 
vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + struct BinaryOpFuncAddFloat { #ifdef USE_NEON @@ -473,12 +687,13 @@ getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam ¶ms) BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>); } -inline void AddQuant8(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + AddElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -493,9 +708,9 @@ inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape // Scalar-broadcast add that can be used for inner loop of more general // broadcast add, so that, for example, scalar-broadcast with batch will still // be fast. -inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -506,31 +721,115 @@ inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } } -inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +// Scalar-broadcast add that can be used for inner loop of more general +// broadcast add, so that, for example, scalar-broadcast with batch will still +// be fast. 
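// Per-element math, as implemented by the scalar fallback below (the NEON path
// computes the same quantities vectorized):
//   s1     = MultiplyByQuantizedMultiplierSmallerThanOneExp(
//              (input1_data + input1_offset) << left_shift, input1_multiplier, input1_shift)
//   s2     = the same rescaling applied to each input2 element
//   out[i] = clamp(MultiplyByQuantizedMultiplierSmallerThanOneExp(
//              s1 + s2, output_multiplier, output_shift) + output_offset,
//              quantized_activation_min, quantized_activation_max)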
+inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, int8_t input1_data, + const int8_t *input2_data, int8_t *output_data) { - if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) + using gemmlowp::RoundingDivideByPOT; + int i = 0; +#ifdef USE_NEON + const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); + const int8x8_t output_activation_min_vector = vdup_n_s8(params.quantized_activation_min); + const int8x8_t output_activation_max_vector = vdup_n_s8(params.quantized_activation_max); + + // Process broadcast scalar. + const int8x8_t input1_val_original = vdup_n_s8(input1_data); + const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); + const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); + const int16x4_t input1_val_high = vget_high_s16(input1_val); + const int16x4_t input1_val_low = vget_low_s16(input1_val); + int32x4_t x11 = vmovl_s16(input1_val_low); + int32x4_t x12 = vmovl_s16(input1_val_high); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); + + for (; i <= size - 8; i += 8) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = - [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_sum(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); + const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); + const int16x4_t input2_val_high = vget_high_s16(input2_val); + const int16x4_t input2_val_low = vget_low_s16(input2_val); + int32x4_t x21 = vmovl_s16(input2_val_low); + int32x4_t x22 = vmovl_s16(input2_val_high); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + int32x4_t s1 = vaddq_s32(x11, x21); + int32x4_t s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); + const int16x4_t s1_narrowed = vmovn_s32(s1); + const int16x4_t s2_narrowed = vmovn_s32(s2); + const int16x8_t s = + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const int8x8_t clamped = + vmax_s8(output_activation_min_vector, vmin_s8(output_activation_max_vector, vqmovn_s16(s))); + vst1_s8(output_data + i, clamped); } - else +#endif // NEON + + if (i < size) { - BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, 
const uint8_t *, const uint8_t *, - uint8_t *)>(AddElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(AddScalarBroadcastQuant8)); + // Process broadcast scalar. + const int32_t input1_val = params.input1_offset + input1_data; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + + for (; i < size; ++i) + { + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) +{ + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) + { + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_sum(params, a, b)); + }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + return; } + + BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + AddElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + AddScalarBroadcast)); } inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -592,8 +891,9 @@ inline void BroadcastSubDispatch(const BinaryArithmeticOpParam ¶ms, const Sh } } -inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_mul(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; @@ -607,9 +907,9 @@ inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t i return clamped_output; } -inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; @@ -671,12 +971,102 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms } } -inline void MulQuant8(const 
BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) +{ + int i = 0; +#ifdef USE_NEON + const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset); + const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); + for (; i <= size - 16; i += 16) + { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_vector); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_vector); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + + auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high); + auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low); + auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high); + auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); + + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + 
vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + MulElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -688,9 +1078,9 @@ inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); } -inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -701,29 +1091,109 @@ inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } } -inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +// Broadcast mul that can often be used for inner loop of broadcast Mul. +inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const int8_t broadcast_value, const int8_t *input2_data, + int8_t *output_data) +{ + const int16_t input1_val = params.input1_offset + broadcast_value; + + int i = 0; +#ifdef USE_NEON + const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); + const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); + for (; i <= size - 16; i += 16) + { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
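    // Only input2 is re-loaded each iteration: the broadcast scalar has already been
    // folded into input1_val above and enters through vmull_n_s16 below.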
+ const auto input2_val_original = vld1q_s8(input2_data + i); + const auto input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + + const auto input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const auto input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + + const auto input2_val_low_low = vget_low_s16(input2_val_low); + const auto input2_val_low_high = vget_high_s16(input2_val_low); + const auto input2_val_high_low = vget_low_s16(input2_val_high); + const auto input2_val_high_high = vget_high_s16(input2_val_high); + + auto p1 = vmull_n_s16(input2_val_high_high, input1_val); + auto p2 = vmull_n_s16(input2_val_high_low, input1_val); + auto p3 = vmull_n_s16(input2_val_low_high, input1_val); + auto p4 = vmull_n_s16(input2_val_low_low, input1_val); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); + + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = - [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_mul(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_mul(params, a, b)); + }; + 
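    // Generic (slow) broadcast fallback: the reference helper walks the full broadcast
    // index space and applies quant8_mul element by element through fn.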
reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); return; } BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(MulElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(MulSimpleBroadcastQuant8)); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + MulElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + MulSimpleBroadcast)); } inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -741,10 +1211,8 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Sh return; } auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); - BinaryBroadcastFiveFold( - params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, - input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - implFuncs.first, implFuncs.second); + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); } inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h index d4397933a..17b2fc7a2 100644 --- a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h @@ -1243,8 +1243,8 @@ inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &in } } -} // nnfw -} // cker -} // optimized +} // namespace optimized +} // namespace cker +} // namespace nnfw #endif diff --git a/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h new file mode 100644 index 000000000..bd8497920 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h @@ -0,0 +1,2138 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ + +#include "cker/CpuBackendThreadpool.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" +#include "cker/operation/Quantize.h" + +#include <fixedpoint/fixedpoint.h> +#include <public/gemmlowp.h> + +namespace nnfw +{ +namespace cker +{ +namespace optimized_integer_ops +{ + +// Category of depthwise convolution output rounding. +enum class DepthwiseConvOutputRounding +{ + kNone = 0, // Invalid: specific method must be specified. + kAwayFromZero, // Original method: exact halves rounded away from zero. + kUpward, // Halves towards +infinity: adds 0.5 before truncate. + // This is where a future kNearestEven would be placed. +}; + +// Category of depthwise convolution depth multiplication. +enum class DepthwiseConvDepthMultiplication +{ + kNoMultiplication = 0, // Depth multiplier = 1. + kUnitInputDepth, // Input depth = 1, output depth = depth multiplier. +}; + +namespace depthwise_conv +{ + +// Implementation of quantized DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct QuantizedDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON +template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(filter_ptr); + filter_s8.val[1] = vld1_s8(filter_ptr + 8); + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[0].val[i] = + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + acc[1].val[i] = + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. 
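      // Input depth 8 with depth multiplier 1: each pixel produces 8 output channels,
      // so the pair of pixels handled here fills four int32x4 accumulators.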
+ int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + acc[0] = vld1q_s32(acc_buffer_ptr); + acc[1] = vld1q_s32(acc_buffer_ptr + 4); + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc[0]); + vst1q_s32(acc_buffer_ptr + 4, acc[1]); + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. 
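      // Each remaining pixel supplies only 4 input bytes, so they are inserted lane by
      // lane (presumably to avoid over-reading with a full 8-byte vld1_s8).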
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4x2_t input_dup2 = vzip_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + int outp = 0; + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. 
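      // 2 input channels x depth multiplier 8 = 16 output channels for this pixel,
      // held in four int32x4 accumulators; the lane index selects which input
      // channel feeds each half of the 16 filter values.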
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input_dup2); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. 
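    // Input depth 2 with depth multiplier 1: the two filter taps are duplicated into
    // lanes 0..3 below so that each int16x4 multiply-accumulate covers two pixels
    // (two channels each).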
+ int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. 
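      // Only two accumulator lanes are live here; the int32x2 pair is widened to an
      // int32x4 for vmlal_s16 and the low half is written back afterwards.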
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x2_t acc = vld1_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); + acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); + acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); + acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vmlal_n_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. 
+ int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i); + const int16x8_t input_s16 = vmovl_s8(input_s8); + input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + } + input_ptr += 16; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // We will have to duplicate bytes in a NEON register, 3-fold. + // We will do that by register-level table-look-up using VTBL instructions. + // Here we prepare the registers containing the table-lookup indices. + static const int8_t dup3_indices_array[3][8] = { + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + int8x8_t dup3_indices[3]; + for (int i = 0; i < 3; i++) + { + dup3_indices[i] = vld1_s8(dup3_indices_array[i]); + } + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + int16x8_t filter[3]; + int8x8x3_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + filter_s8.val[2] = vld1_s8(local_filter_ptr + 16); + local_filter_ptr += 24; + for (int i = 0; i < 3; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, duplicate 3-fold, add input_offset. 
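        // (A sketch of what the vtbl1_s8 lookups below produce: for input bytes
        // {i0,i1,i2,i3,i4,i5,i6,i7}, the three dup3_indices tables select
        //   {i0,i0,i0,i1,i1,i1,i2,i2}, {i2,i3,i3,i3,i4,i4,i4,i5}, {i5,i5,i6,i6,i6,i7,i7,i7},
        // i.e. each input channel value is repeated depth_multiplier == 3 times,
        // matching the scalar expansion
        //   for (int k = 0; k < 8; ++k)
        //     for (int r = 0; r < 3; ++r)
        //       dup3[3 * k + r] = in[k];   // "dup3" and "in" are illustrative names
        // so each duplicated lane lines up with one output channel in the
        // multiply-accumulates that follow.)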
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + + int8x8_t input_s8_dup3[3]; + for (int i = 0; i < 3; i++) + { + input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]); + } + int16x8_t input_dup3[3]; + for (int i = 0; i < 3; i++) + { + const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]); + input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4x3_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); + } + // Multiply-accumulate + for (int j = 0; j < 3; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); + } + acc_buffer_ptr += 24; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 3; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 3; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + int16x8_t filter[2]; + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + local_filter_ptr += 16; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, add input_offset, duplicate 2-fold. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Load the accumulators from acc_buffer. + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Multiply-accumulate. + for (int j = 0; j < 2; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + } + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the inputs. 
+ const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 2; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1); + local_filter_ptr += 16; + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0); + int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1); + local_input_ptr += 16; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(local_filter_ptr); + local_filter_ptr += 8; + const int16x8_t filter = vmovl_s8(filter_s8); + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle one input channel at a time. 
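      // (The variable-input-depth kernels all follow this tiered pattern: as many
      // 16-channel and then 8-channel vector iterations as fit, with the scalar
      // loop below mopping up the remaining input_depth % 8 channels. For example,
      // input_depth == 21 runs one 16-channel iteration, no 8-channel iteration,
      // and 5 scalar iterations.)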
+ for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += input_ptr_increment; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. 
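    // (For the <kAllowStrided, 1, N> kernels input_depth is 1, so each output
    // pixel consumes a single input value that is broadcast against N filter
    // values; a scalar sketch for N == 16, with illustrative indexing:
    //
    //   for (int m = 0; m < 16; ++m)
    //     acc_buffer_ptr[m] += static_cast<int32_t>(filter_ptr[m]) * (*input_ptr + input_offset);
    //
    // vmlal_n_s16 below performs four of these multiply-accumulates per
    // instruction.)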
+ int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); + acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1); + int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2); + int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3); + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + int16x8_t filter_2 = vmovl_s8(filter_s8_2); + int16x8_t filter_3 = vmovl_s8(filter_s8_3); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4); + int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5); + int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6); + int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7); + // Multiply-accumulate + acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input); + acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input); + acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input); + acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input); + acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input); + acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input); + acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input); + acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4); + vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5); + vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6); + vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7); + acc_buffer_ptr += 32; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t 
input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
+    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+    // We load the first 16 bytes into filter_s8_{0,1} as usual.
+    // Then we load the last 8 bytes into filter_s8_x (x for 'extra').
+    // This is redundant: the first 4 bytes of filter_s8_x are the same
+    // as the last 4 bytes of filter_s8_1.
+    int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+    int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+    int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
+    int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+    int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+    int16x8_t filter_x = vmovl_s8(filter_s8_x);
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++)
+    {
+      int8_t input_s8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      acc_buffer_ptr += 20;
+    }
+  }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
+{
+  static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+                  const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
+    const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+    const int16x8_t filter = vmovl_s8(filter_s8);
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++)
+    {
+      int8_t input_s8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++)
+      {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++)
+      {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
+{
+  static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+                  const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
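    // (In this <kAllowStrided, 2, 1> kernel the two filter values are laid out
    // as {f0, f1, f0, f1}, so in the 2-pixel path below one vmlal_s16 updates
    // the two output channels of two consecutive output pixels at once; the two
    // int8 inputs of each pixel are fetched with a single 16-bit load and then
    // widened to int16.)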
+ int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int16x4_t input_s16 = vdup_n_s16(0); + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 0); + input_ptr += input_ptr_increment; + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 1); + input_ptr += input_ptr_increment; + input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + if (num_output_pixels <= 0) + { + return; + } + + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle one output pixel at a time until second to the last pixel. Second + // to the last because we read eight input pixels while only processing + // four. + for (; outp < num_output_pixels - 1; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle the last output pixel. 
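    // (The main loop above uses an 8-byte vld1_s8 even though only 4 input
    // values are consumed per pixel, so it may read up to 4 bytes beyond the
    // current pixel; stopping one pixel early and loading the final pixel
    // lane-by-lane below keeps all reads inside the input buffer.)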
+ // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4); + int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1); + int16x4_t filter_0 = vget_low_s16(filter_s16_0); + int16x4_t filter_1 = vget_high_s16(filter_s16_0); + int16x4_t filter_2 = vget_high_s16(filter_s16_1); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(input_ptr); + int8x8_t input_s8_1 = vld1_s8(input_ptr + 4); + input_ptr += input_ptr_increment; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + + // Multiply-accumulate + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); + acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); + + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + + acc_buffer_ptr += 12; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, int16_t input_offset, + int pad_width, int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32_t *acc_buffer) +{ + // Consistency check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. 
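  // (Reading the template arguments: kAllowStrided selects whether stride != 1
  // is supported, kFixedInputDepth pins input_depth at compile time (0 means
  // "any depth"), and kFixedDepthMultiplier pins depth_multiplier. For example,
  //   QuantizedDepthwiseConvKernel<false, 8, 1>  // stride 1 only, input_depth == 8, multiplier == 1
  //   QuantizedDepthwiseConvKernel<true, 0, 3>   // any stride, any input_depth, multiplier == 3
  // which is what the static_asserts and runtime asserts below enforce.)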
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of DepthwiseConvAccumRow, portable, non-templatized. 
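// (Both the templated accumulator above and the generic fallback below clamp
// the output-x range the same way: the input sample used for output column
// out_x is in_x = out_x * stride - pad_width + dilation_factor * filter_x, and
// requiring 0 <= in_x < input_width gives
//
//   out_x >= ceil((pad_width - dilation_factor * filter_x) / stride)
//   out_x <  ceil((pad_width + input_width - dilation_factor * filter_x) / stride)
//
// which is what the (numerator + stride - 1) / stride expressions compute for
// non-negative numerators, before the range is further clamped to the
// [out_x_buffer_start, out_x_buffer_end) chunk currently being processed.)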
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, + int16_t input_offset, int pad_width, + int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, + int32_t *acc_buffer) +{ + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) + { + const int8_t *filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) + { + const int16_t input_val = *input_ptr++ + input_offset; + for (int m = 0; m < depth_multiplier; m++) + { + const int16_t filter_val = *filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const int32_t *bias_data, int32_t *acc_buffer) +{ + int i = 0; +#ifdef USE_NEON + if (output_depth == 1) + { + const int32x4_t b = vdupq_n_s32(bias_data[0]); + for (; i <= num_output_pixels - 16; i += 16) + { + vst1q_s32(acc_buffer + i + 0, b); + vst1q_s32(acc_buffer + i + 4, b); + vst1q_s32(acc_buffer + i + 8, b); + vst1q_s32(acc_buffer + i + 12, b); + } + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + i, b); + } + } + else if (output_depth == 2) + { + int32x4_t b = vdupq_n_s32(bias_data[0]); + b = vsetq_lane_s32(bias_data[1], b, 1); + b = vsetq_lane_s32(bias_data[1], b, 3); + for (; i <= num_output_pixels - 8; i += 8) + { + vst1q_s32(acc_buffer + 2 * i + 0, b); + vst1q_s32(acc_buffer + 2 * i + 4, b); + vst1q_s32(acc_buffer + 2 * i + 8, b); + vst1q_s32(acc_buffer + 2 * i + 12, b); + } + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 2 * i, b); + } + } + else if (output_depth == 4) + { + const int32x4_t b = vld1q_s32(bias_data); + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + 4 * i + 0, b); + vst1q_s32(acc_buffer + 4 * i + 4, b); + vst1q_s32(acc_buffer + 4 * i + 8, b); + vst1q_s32(acc_buffer + 4 * i + 12, b); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 4 * i, b); + } + } + else if (output_depth == 8) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + vst1q_s32(acc_buffer + 8 * i + 8, b0); + vst1q_s32(acc_buffer + 8 * i + 12, b1); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + } + } + else if (output_depth == 16) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = 
vld1q_s32(bias_data + 4); + const int32x4_t b2 = vld1q_s32(bias_data + 8); + const int32x4_t b3 = vld1q_s32(bias_data + 12); + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 16 * i + 0, b0); + vst1q_s32(acc_buffer + 16 * i + 4, b1); + vst1q_s32(acc_buffer + 16 * i + 8, b2); + vst1q_s32(acc_buffer + 16 * i + 12, b3); + } + } +#endif + for (; i < num_output_pixels; i++) + { + memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth); + } +} + +inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape & /* bias_shape */, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, int thread_start, + int thread_end, int thread_dim) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_rows = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + static const int kAccBufferMaxSize = 2048; + int32_t acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + UNUSED_RELEASE(kAccBufferActualSize); + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric); + row_accum_func_t row_accum_func = nullptr; + +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \ + if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ + (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ + { \ + row_accum_func = \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + // We go over our list of kernels by decreasing order of preference + // for the cases where multiple kernels could apply. + + // Start with the fastest kernels: AllowStrided=false, fixed input depth. 
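  // (Each TFMINI_USE_DEPTHWISECONV_KERNEL(A, D, M) line below expands to a
  // guarded assignment, so the first matching entry in this preference-ordered
  // list wins; e.g. TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) becomes,
  // roughly:
  //
  //   if (!row_accum_func && stride_width == 1 && input_depth == 8 && depth_multiplier == 1)
  //     row_accum_func = QuantizedDepthwiseConvAccumRow<false, 8, 1>;
  //
  // and anything not matched falls through to the generic row accumulator.)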
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1) + + // Next come the strided kernels: AllowStrided=true, fixed input depth. + // They are a bit less efficient, but allow stride!=1. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) +#endif // USE_NEON + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; + } + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_rows; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + assert(thread_start >= 0); + assert(thread_end <= output_rows); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + int8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_rows + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. 
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + input_offset, pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating int32_t values. Now need to convert them to + // the final 8bit form and store them. + const int num_output_values = output_depth * num_output_pixels; + + Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset, + output_activation_min, output_activation_max, acc_buffer, output_ptr); + + output_ptr += num_output_values; + } + } + output_ptr += batch_step; + } +} + +} // namespace depthwise_conv + +template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + int thread_start, int thread_end, int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + +// TODO Use below codes +#if 0 +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +#if defined(__ANDROID__) && defined(__clang__) + CpuFlags cpu_flags; + GetCpuFlags(&cpu_flags); + const bool has_dot_product_instructions = cpu_flags.neon_dotprod; + + // Dispatch to dot-product 3x3 kernels when supported. 
+ if (has_dot_product_instructions) + { + using optimized_ops::depthwise_conv::DotProduct3x3KernelType; + DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, output_shape, params, output_shift); + if (kernel_type != DotProduct3x3KernelType::kNone) + { + DepthwiseConvParams params_copy = params; + params_copy.output_shift_per_channel = output_shift; + params_copy.output_multiplier_per_channel = output_multiplier; + optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel< + DepthwiseConvImplementation::kUseNeon3x3DotProduct>( + params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } + } + +#endif + // Dispatch to non-dot-product 3x3 kernels when supported. + + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + + // Call kernel optimized for depthwise convolutions using 3x3 filters if + // parameters are supported. + if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, stride_width, stride_height, dilation_width_factor, + dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0, + output_shift)) + { + optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel< + DepthwiseConvOutputRounding::kUpward>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } +#endif + +#endif /* end of if 0 */ + + depthwise_conv::DepthwiseConvGeneral( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const int8_t *input_data, const Shape &filter_shape, + const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + int8_t *output_data, int thread_start, int thread_end, int thread_dim) +{ + return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task +{ + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), output_multiplier_(output_multiplier), output_shift_(output_shift), + input_shape_(input_shape), input_data_(input_data), filter_shape_(filter_shape), + filter_data_(filter_data), bias_shape_(bias_shape), bias_data_(bias_data), + output_shape_(output_shape), 
output_data_(output_data), thread_start_(thread_start), + thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + DepthwiseConvImpl(params_, output_multiplier_, output_shift_, input_shape_, input_data_, + filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_, + output_data_, thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const int32_t *output_multiplier_; + const int32_t *output_shift_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape, int thread_dim) +{ + constexpr int kMinMulPerThread = 8; + const int output_units = output_shape.Dims(thread_dim); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_mul_per_unit = + FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width; + const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1; + int thread_count = output_units / min_units_per_thread; + return thread_count; +} + +inline void DepthwiseConvPerChannel(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + ruy::Context *ruy_context) +{ + UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context); + + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batches = output_shape.Dims(0); + const int output_rows = output_shape.Dims(1); + int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0); + int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1); + int thread_dim, thread_count, thread_dim_size; + if (thread_count_batch > thread_count_row) + { + thread_dim = 0; + thread_dim_size = output_batches; + thread_count = thread_count_batch; + } + else + { + thread_dim = 1; + thread_dim_size = output_rows; + thread_count = thread_count_row; + } + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const int max_threads = ruy_context->max_num_threads(); + thread_count = std::max(1, std::min(thread_count, max_threads)); + + if (thread_count == 1) + { + DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, + /*thread_start=*/0, + /*thread_end=*/output_rows, /*thread_dim=*/1); + } + else + { + std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). 
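    // (The loop below hands each task a chunk of thread_dim_size as evenly as
    // possible, via thread_end = thread_start + remaining / tasks_left. For
    // example, thread_dim_size == 7 with thread_count == 3 yields the ranges
    // [0, 2), [2, 4) and [4, 7).)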
+ tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) + { + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; + } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); + } +} + +} // namespace optimized_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h index 93cb21e0b..96e1d9127 100644 --- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -62,7 +62,7 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap } template <typename T> -inline void BroadcastBinaryArithmeticOpSlowQuant8( +inline typename std::enable_if_t<is_quant8<T>::value> BroadcastBinaryArithmeticOpSlow( const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) @@ -72,11 +72,6 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); - if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255)) - { - throw std::runtime_error{"Support only for Quant8."}; - } - // Comment from tensorflow lite: // // In Tensorflow, the dimensions are canonically named (batch_number, row, @@ -98,11 +93,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax<uint8_t>( - fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( + fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h index 43a5bf256..4474754af 100644 --- a/compute/cker/include/cker/operation/reference/Conv.h +++ b/compute/cker/include/cker/operation/reference/Conv.h @@ -190,6 +190,116 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 } } +inline void Conv(const ConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, int8_t *output_data) +{ + UNUSED_RELEASE(bias_shape); + // Get parameters. 
+ const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + assert(output_activation_min < output_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(jianlijianli): Add a check to make sure the + // accumulator depth is smaller than 2^16. 
+ acc += filter_val * (input_val + input_offset); + } + } + } + + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[out_channel], + output_shift[out_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = + static_cast<int8_t>(acc); + } + } + } + } +} + } // namespace reference } // namespace cker } // namespace nnfw |
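The reference kernel above, like the depthwise paths earlier in this diff, finishes each int32 accumulator with MultiplyByQuantizedMultiplier(acc, output_multiplier[c], output_shift[c]). That helper is not part of this hunk; the sketch below shows the usual TFLite-style fixed-point requantization that cker's helper is assumed to mirror (a Q0.31 multiplier plus a power-of-two shift, typically non-positive for convolution outputs, with round-to-nearest behaviour). The function name is chosen here for illustration only.

#include <cstdint>

// Illustrative sketch only: assumes the cker helper follows the TFLite definition
// (saturating rounding-doubling high multiply by a Q0.31 multiplier, then a
// rounding arithmetic right shift by -shift when shift <= 0). The saturation of
// the INT32_MIN * INT32_MIN edge case is omitted for brevity.
inline int32_t MultiplyByQuantizedMultiplierSketch(int32_t acc, int32_t quantized_multiplier,
                                                   int shift)
{
  const int left_shift = shift > 0 ? shift : 0;   // usually 0 for conv outputs
  const int right_shift = shift > 0 ? 0 : -shift;

  // Rounding-doubling high multiply: keep the upper 32 bits of the 64-bit
  // product of (acc << left_shift) and the Q0.31 multiplier, with a rounding nudge.
  const int64_t prod = static_cast<int64_t>(acc * (1 << left_shift)) * quantized_multiplier;
  const int64_t nudge = prod >= 0 ? (1ll << 30) : (1 - (1ll << 30));
  int32_t result = static_cast<int32_t>((prod + nudge) / (1ll << 31));

  // Rounding arithmetic right shift (round half away from zero).
  if (right_shift > 0)
  {
    const int32_t mask = (1 << right_shift) - 1;
    const int32_t remainder = result & mask;
    const int32_t threshold = (mask >> 1) + (result < 0 ? 1 : 0);
    result = (result >> right_shift) + (remainder > threshold ? 1 : 0);
  }
  return result;
}

// The surrounding per-channel epilogue then matches the code in the hunk above:
//   acc = MultiplyByQuantizedMultiplierSketch(acc, output_multiplier[c], output_shift[c]);
//   acc += output_offset;
//   acc = std::max(acc, output_activation_min);
//   acc = std::min(acc, output_activation_max);
//   output[i] = static_cast<int8_t>(acc);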